feat: scaffold realtime Korean voice assistant bot
This commit is contained in:
16
.env.example
Normal file
16
.env.example
Normal file
@@ -0,0 +1,16 @@
|
||||
DISCORD_BOT_TOKEN=
|
||||
DISCORD_APPLICATION_ID=
|
||||
DISCORD_COMMAND_GUILD_ID=
|
||||
|
||||
OPENAI_API_KEY=
|
||||
OPENAI_MODEL=gpt-5.4-mini
|
||||
|
||||
ELEVENLABS_API_KEY=
|
||||
ELEVENLABS_VOICE_ID=
|
||||
ELEVENLABS_STT_MODEL=scribe_v2_realtime
|
||||
ELEVENLABS_TTS_MODEL=eleven_flash_v2_5
|
||||
|
||||
BOT_DEFAULT_LANGUAGE=ko
|
||||
MAX_CONVERSATION_TURNS=12
|
||||
DEBUG_TEXT_EVENTS=false
|
||||
LOG_LEVEL=info
|
||||
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
node_modules
|
||||
dist
|
||||
.env
|
||||
82
README.md
Normal file
82
README.md
Normal file
@@ -0,0 +1,82 @@
|
||||
# realtime_voice_bot
|
||||
|
||||
디스코드 음성 채널에서 여러 사용자의 음성을 개별로 받아 한국어로 인식하고, LLM 응답을 생성한 뒤 ElevenLabs TTS로 다시 읽어주는 최소 프로토타입입니다.
|
||||
|
||||
## 현재 구현 범위
|
||||
|
||||
- Discord slash command 기반 제어: `/join`, `/leave`, `/status`, `/reset`, `/say`
|
||||
- `@discordjs/voice` 기반 음성 채널 입장 및 유저별 오디오 수신
|
||||
- 48k stereo PCM을 16k mono로 내려서 유저별 VAD 처리
|
||||
- Silero 계열 VAD(`avr-vad`)로 발화 시작/종료 감지
|
||||
- ElevenLabs Scribe Realtime WebSocket으로 발화 단위 STT
|
||||
- OpenAI Responses API로 짧은 한국어 답변 생성
|
||||
- ElevenLabs Flash v2.5 스트리밍 TTS
|
||||
- 채널 단위 단일 재생 큐
|
||||
- 사용자 발화 시작 시 현재 TTS와 대기열 중단(barge-in)
|
||||
|
||||
## 권장 환경
|
||||
|
||||
- Bun `1.3+`
|
||||
- Node.js `22.12+`
|
||||
- Discord bot with Voice permissions
|
||||
- ElevenLabs API key + 사용할 Voice ID
|
||||
- OpenAI API key
|
||||
|
||||
## 환경 변수
|
||||
|
||||
`.env.example`를 참고해서 `.env`를 채우면 됩니다.
|
||||
|
||||
필수:
|
||||
|
||||
- `DISCORD_BOT_TOKEN`
|
||||
- `DISCORD_APPLICATION_ID`
|
||||
- `OPENAI_API_KEY`
|
||||
- `ELEVENLABS_API_KEY`
|
||||
- `ELEVENLABS_VOICE_ID`
|
||||
|
||||
선택:
|
||||
|
||||
- `DISCORD_COMMAND_GUILD_ID`
|
||||
  - 테스트 서버에만 slash command를 즉시 반영하려면 설정
|
||||
- `OPENAI_MODEL`
|
||||
  - 기본값: `gpt-5.4-mini`
|
||||
- `ELEVENLABS_STT_MODEL`
|
||||
  - 기본값: `scribe_v2_realtime`
|
||||
- `ELEVENLABS_TTS_MODEL`
|
||||
  - 기본값: `eleven_flash_v2_5`
|
||||
- `DEBUG_TEXT_EVENTS`
|
||||
  - `true`면 명령을 실행한 텍스트 채널에 transcript/reply를 같이 올림
|
||||
|
||||
## 실행
|
||||
|
||||
```bash
|
||||
bun install
|
||||
bun run start
|
||||
```
|
||||
|
||||
개발 모드:
|
||||
|
||||
```bash
|
||||
bun run dev
|
||||
```
|
||||
|
||||
타입 체크:
|
||||
|
||||
```bash
|
||||
bun run check
|
||||
```
|
||||
|
||||
## 사용 흐름
|
||||
|
||||
1. 봇을 서버에 초대하고 음성 권한을 부여합니다.
|
||||
2. 음성 채널에 들어갑니다.
|
||||
3. 텍스트 채널에서 `/join`을 실행합니다.
|
||||
4. 말을 하면 봇이 발화 단위로 인식하고 음성으로 짧게 답합니다.
|
||||
5. 다시 말하면 현재 읽고 있던 TTS는 즉시 중단됩니다.
|
||||
|
||||
## 설계 메모
|
||||
|
||||
- 입력은 유저별 병렬 처리
|
||||
- 출력은 길드 세션당 단일 큐
|
||||
- 화자 구분은 `speaker_id`, `speaker_name`을 LLM 프롬프트에 항상 포함
|
||||
- 최소 프로토타입이므로 Deepgram 대체 STT, 장기 메모리, 고급 명령 라우팅은 아직 포함하지 않았습니다.
|
||||
225
bun.lock
Normal file
225
bun.lock
Normal file
@@ -0,0 +1,225 @@
|
||||
{
|
||||
"lockfileVersion": 1,
|
||||
"configVersion": 1,
|
||||
"workspaces": {
|
||||
"": {
|
||||
"name": "realtime_voice_bot",
|
||||
"dependencies": {
|
||||
"@discordjs/voice": "^0.19.2",
|
||||
"avr-vad": "^1.0.10",
|
||||
"discord.js": "^14.26.3",
|
||||
"dotenv": "^17.4.2",
|
||||
"ffmpeg-static": "^5.3.0",
|
||||
"openai": "^6.35.0",
|
||||
"opusscript": "^0.1.1",
|
||||
"prism-media": "^1.3.5",
|
||||
"ws": "^8.20.0",
|
||||
"zod": "^4.3.6",
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^25.6.0",
|
||||
"typescript": "^6.0.3",
|
||||
},
|
||||
},
|
||||
},
|
||||
"trustedDependencies": [
|
||||
"onnxruntime-node",
|
||||
],
|
||||
"packages": {
|
||||
"@derhuerst/http-basic": ["@derhuerst/http-basic@8.2.4", "", { "dependencies": { "caseless": "^0.12.0", "concat-stream": "^2.0.0", "http-response-object": "^3.0.1", "parse-cache-control": "^1.0.1" } }, "sha512-F9rL9k9Xjf5blCz8HsJRO4diy111cayL2vkY2XE4r4t3n0yPXVYy3KD3nJ1qbrSn9743UWSXH4IwuCa/HWlGFw=="],
|
||||
|
||||
"@discordjs/builders": ["@discordjs/builders@1.14.1", "", { "dependencies": { "@discordjs/formatters": "^0.6.2", "@discordjs/util": "^1.2.0", "@sapphire/shapeshift": "^4.0.0", "discord-api-types": "^0.38.40", "fast-deep-equal": "^3.1.3", "ts-mixer": "^6.0.4", "tslib": "^2.6.3" } }, "sha512-gSKkhXLqs96TCzk66VZuHHl8z2bQMJFGwrXC0f33ngK+FLNau4hU1PYny3DNJfNdSH+gVMzE85/d5FQ2BpcNwQ=="],
|
||||
|
||||
"@discordjs/collection": ["@discordjs/collection@1.5.3", "", {}, "sha512-SVb428OMd3WO1paV3rm6tSjM4wC+Kecaa1EUGX7vc6/fddvw/6lg90z4QtCqm21zvVe92vMMDt9+DkIvjXImQQ=="],
|
||||
|
||||
"@discordjs/formatters": ["@discordjs/formatters@0.6.2", "", { "dependencies": { "discord-api-types": "^0.38.33" } }, "sha512-y4UPwWhH6vChKRkGdMB4odasUbHOUwy7KL+OVwF86PvT6QVOwElx+TiI1/6kcmcEe+g5YRXJFiXSXUdabqZOvQ=="],
|
||||
|
||||
"@discordjs/rest": ["@discordjs/rest@2.6.1", "", { "dependencies": { "@discordjs/collection": "^2.1.1", "@discordjs/util": "^1.2.0", "@sapphire/async-queue": "^1.5.3", "@sapphire/snowflake": "^3.5.5", "@vladfrangu/async_event_emitter": "^2.4.6", "discord-api-types": "^0.38.40", "magic-bytes.js": "^1.13.0", "tslib": "^2.6.3", "undici": "6.24.1" } }, "sha512-wwQdgjeaoYFiaG+atbqx6aJDpqW7JHAo0HrQkBTbYzM3/PJ3GweQIpgElNcGZ26DCUOXMyawYd0YF7vtr+fZXg=="],
|
||||
|
||||
"@discordjs/util": ["@discordjs/util@1.2.0", "", { "dependencies": { "discord-api-types": "^0.38.33" } }, "sha512-3LKP7F2+atl9vJFhaBjn4nOaSWahZ/yWjOvA4e5pnXkt2qyXRCHLxoBQy81GFtLGCq7K9lPm9R517M1U+/90Qg=="],
|
||||
|
||||
"@discordjs/voice": ["@discordjs/voice@0.19.2", "", { "dependencies": { "@snazzah/davey": "^0.1.9", "@types/ws": "^8.18.1", "discord-api-types": "^0.38.41", "prism-media": "^1.3.5", "tslib": "^2.8.1", "ws": "^8.19.0" } }, "sha512-3yJ255e4ag3wfZu/DSxeOZK1UtnqNxnspmLaQetGT0pDkThNZoHs+Zg6dgZZ19JEVomXygvfHn9lNpICZuYtEA=="],
|
||||
|
||||
"@discordjs/ws": ["@discordjs/ws@1.2.3", "", { "dependencies": { "@discordjs/collection": "^2.1.0", "@discordjs/rest": "^2.5.1", "@discordjs/util": "^1.1.0", "@sapphire/async-queue": "^1.5.2", "@types/ws": "^8.5.10", "@vladfrangu/async_event_emitter": "^2.2.4", "discord-api-types": "^0.38.1", "tslib": "^2.6.2", "ws": "^8.17.0" } }, "sha512-wPlQDxEmlDg5IxhJPuxXr3Vy9AjYq5xCvFWGJyD7w7Np8ZGu+Mc+97LCoEc/+AYCo2IDpKioiH0/c/mj5ZR9Uw=="],
|
||||
|
||||
"@emnapi/core": ["@emnapi/core@1.10.0", "", { "dependencies": { "@emnapi/wasi-threads": "1.2.1", "tslib": "^2.4.0" } }, "sha512-yq6OkJ4p82CAfPl0u9mQebQHKPJkY7WrIuk205cTYnYe+k2Z8YBh11FrbRG/H6ihirqcacOgl2BIO8oyMQLeXw=="],
|
||||
|
||||
"@emnapi/runtime": ["@emnapi/runtime@1.10.0", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-ewvYlk86xUoGI0zQRNq/mC+16R1QeDlKQy21Ki3oSYXNgLb45GV1P6A0M+/s6nyCuNDqe5VpaY84BzXGwVbwFA=="],
|
||||
|
||||
"@emnapi/wasi-threads": ["@emnapi/wasi-threads@1.2.1", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-uTII7OYF+/Mes/MrcIOYp5yOtSMLBWSIoLPpcgwipoiKbli6k322tcoFsxoIIxPDqW01SQGAgko4EzZi2BNv2w=="],
|
||||
|
||||
"@napi-rs/wasm-runtime": ["@napi-rs/wasm-runtime@1.1.4", "", { "dependencies": { "@tybys/wasm-util": "^0.10.1" }, "peerDependencies": { "@emnapi/core": "^1.7.1", "@emnapi/runtime": "^1.7.1" } }, "sha512-3NQNNgA1YSlJb/kMH1ildASP9HW7/7kYnRI2szWJaofaS1hWmbGI4H+d3+22aGzXXN9IJ+n+GiFVcGipJP18ow=="],
|
||||
|
||||
"@sapphire/async-queue": ["@sapphire/async-queue@1.5.5", "", {}, "sha512-cvGzxbba6sav2zZkH8GPf2oGk9yYoD5qrNWdu9fRehifgnFZJMV+nuy2nON2roRO4yQQ+v7MK/Pktl/HgfsUXg=="],
|
||||
|
||||
"@sapphire/shapeshift": ["@sapphire/shapeshift@4.0.0", "", { "dependencies": { "fast-deep-equal": "^3.1.3", "lodash": "^4.17.21" } }, "sha512-d9dUmWVA7MMiKobL3VpLF8P2aeanRTu6ypG2OIaEv/ZHH/SUQ2iHOVyi5wAPjQ+HmnMuL0whK9ez8I/raWbtIg=="],
|
||||
|
||||
"@sapphire/snowflake": ["@sapphire/snowflake@3.5.3", "", {}, "sha512-jjmJywLAFoWeBi1W7994zZyiNWPIiqRRNAmSERxyg93xRGzNYvGjlZ0gR6x0F4gPRi2+0O6S71kOZYyr3cxaIQ=="],
|
||||
|
||||
"@snazzah/davey": ["@snazzah/davey@0.1.11", "", { "optionalDependencies": { "@snazzah/davey-android-arm-eabi": "0.1.11", "@snazzah/davey-android-arm64": "0.1.11", "@snazzah/davey-darwin-arm64": "0.1.11", "@snazzah/davey-darwin-x64": "0.1.11", "@snazzah/davey-freebsd-x64": "0.1.11", "@snazzah/davey-linux-arm-gnueabihf": "0.1.11", "@snazzah/davey-linux-arm64-gnu": "0.1.11", "@snazzah/davey-linux-arm64-musl": "0.1.11", "@snazzah/davey-linux-x64-gnu": "0.1.11", "@snazzah/davey-linux-x64-musl": "0.1.11", "@snazzah/davey-wasm32-wasi": "0.1.11", "@snazzah/davey-win32-arm64-msvc": "0.1.11", "@snazzah/davey-win32-ia32-msvc": "0.1.11", "@snazzah/davey-win32-x64-msvc": "0.1.11" } }, "sha512-oBN+msHzPnm1M5DDx3wVD7iBwpNXFUtkh2MrAbUJu0OhKjliLChi28hq++mu1+qdMpAVQO5JKAvQQxYVbyneiw=="],
|
||||
|
||||
"@snazzah/davey-android-arm-eabi": ["@snazzah/davey-android-arm-eabi@0.1.11", "", { "os": "android", "cpu": "arm" }, "sha512-T1RYbNYKN6tLOcGIDKJd8OI6FBSEemwL7DOYdTMmhqfhhMr3YVN8WOhfoxGg63OcnpTN2e2c5tdY2bAx25RmQQ=="],
|
||||
|
||||
"@snazzah/davey-android-arm64": ["@snazzah/davey-android-arm64@0.1.11", "", { "os": "android", "cpu": "arm64" }, "sha512-ksJn/x2VU8h6w9eku1HT96ugSRZ7lKVkKNKbFleaFN+U99DJaPM+gMu2YvnFU4V54HR06ZBnRihnVG6VLXQpDw=="],
|
||||
|
||||
"@snazzah/davey-darwin-arm64": ["@snazzah/davey-darwin-arm64@0.1.11", "", { "os": "darwin", "cpu": "arm64" }, "sha512-E1d7PbaaVMO3Lj9EiAPqOVbuV0xg5+PsHzHH097DDXiD1+zUDXvJaTnUWsnm5z50pJniHpi4GtaYmk+ieB/guA=="],
|
||||
|
||||
"@snazzah/davey-darwin-x64": ["@snazzah/davey-darwin-x64@0.1.11", "", { "os": "darwin", "cpu": "x64" }, "sha512-Tl4TI/LTmgJZepgbgVMYDi8RqlAkPtPg1OEBPl7a9Tn3AwR36Vs6lyIT1cs/lGy/ds/+B+mKI4rPObN1cyILTw=="],
|
||||
|
||||
"@snazzah/davey-freebsd-x64": ["@snazzah/davey-freebsd-x64@0.1.11", "", { "os": "freebsd", "cpu": "x64" }, "sha512-T8Iw9FXkuI1T+YBAFzh9v/TXf9IOTOSqnd/BFpTRTrlW72PR2lhIidzSmg027VxO7r5pX47iFwiOkb9I/NU/EA=="],
|
||||
|
||||
"@snazzah/davey-linux-arm-gnueabihf": ["@snazzah/davey-linux-arm-gnueabihf@0.1.11", "", { "os": "linux", "cpu": "arm" }, "sha512-1Txj+8pqA8uq/OGtaUaBFWAPnNMQzFgIywj0iA7EI4xZl+mab48/pv+YZ1pNb/suC6ynsW44oB9efiXSdcUAgA=="],
|
||||
|
||||
"@snazzah/davey-linux-arm64-gnu": ["@snazzah/davey-linux-arm64-gnu@0.1.11", "", { "os": "linux", "cpu": "arm64" }, "sha512-ERzF5nM/IYW1BcN3wLXpEwBCGLFf0kGJUVhaV6yfiInz0tkU8UmvrrgpaMaACfMjIhfWdq5CcX+aTkXo/saNcg=="],
|
||||
|
||||
"@snazzah/davey-linux-arm64-musl": ["@snazzah/davey-linux-arm64-musl@0.1.11", "", { "os": "linux", "cpu": "arm64" }, "sha512-e6pX6Hiabtz99q+H/YHNkm9JVlpqN8HGh0qPib8G2+UY4/SSH8WvqWipk3v581dMy2oyCHt7MOoY1aU1P1N/xA=="],
|
||||
|
||||
"@snazzah/davey-linux-x64-gnu": ["@snazzah/davey-linux-x64-gnu@0.1.11", "", { "os": "linux", "cpu": "x64" }, "sha512-TW5bSoqChOJMbvsDb4wAATYrxmAXuNnse7wFNVSAJUaZKSeRfZbu3UAiPWSNn7GwLwSfU6hg322KZUn8IWCuvg=="],
|
||||
|
||||
"@snazzah/davey-linux-x64-musl": ["@snazzah/davey-linux-x64-musl@0.1.11", "", { "os": "linux", "cpu": "x64" }, "sha512-5j6Pmc+Wzv5lSxVP6quA7teYRJXibkZqQyYGfTDnTsUOO5dPpcojpqlXlkhyvsA1OAQTj4uxbOCciN3cVWwzug=="],
|
||||
|
||||
"@snazzah/davey-wasm32-wasi": ["@snazzah/davey-wasm32-wasi@0.1.11", "", { "dependencies": { "@napi-rs/wasm-runtime": "^1.1.2" }, "cpu": "none" }, "sha512-rKOwZ/0J8lp+4VEyOdMDBRP9KR+PksZpa9V1Qn0veMzy4FqTVKthkxwGqewheFe0SFg9fdvt798l/PBFrfDeZw=="],
|
||||
|
||||
"@snazzah/davey-win32-arm64-msvc": ["@snazzah/davey-win32-arm64-msvc@0.1.11", "", { "os": "win32", "cpu": "arm64" }, "sha512-5fptJU4tX901m3mj0SHiBljMrPT4ZEsynbBhR7bK1yn9TY1jjyhN8EFi7QF5IWtUEni+0mia2BCMHZ5ZkmFZqQ=="],
|
||||
|
||||
"@snazzah/davey-win32-ia32-msvc": ["@snazzah/davey-win32-ia32-msvc@0.1.11", "", { "os": "win32", "cpu": "ia32" }, "sha512-ualexn8SeLsiMHhWfzVrzRcjHgcBapg++FPaVgJJxoh2S/jCRiklXOu3luqIZdJdNKvhe2V9SwO/cImPeIIBKw=="],
|
||||
|
||||
"@snazzah/davey-win32-x64-msvc": ["@snazzah/davey-win32-x64-msvc@0.1.11", "", { "os": "win32", "cpu": "x64" }, "sha512-muNhc8UKXtknzsH/w4AIkbPR2I8BuvApn0pDXar0IEvY8PCjqU/M8MPbOOEYwQVvQRMwVTgExtxzrkBPSXB4nA=="],
|
||||
|
||||
"@tybys/wasm-util": ["@tybys/wasm-util@0.10.1", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-9tTaPJLSiejZKx+Bmog4uSubteqTvFrVrURwkmHixBo0G4seD0zUxp98E1DzUBJxLQ3NPwXrGKDiVjwx/DpPsg=="],
|
||||
|
||||
"@types/node": ["@types/node@25.6.0", "", { "dependencies": { "undici-types": "~7.19.0" } }, "sha512-+qIYRKdNYJwY3vRCZMdJbPLJAtGjQBudzZzdzwQYkEPQd+PJGixUL5QfvCLDaULoLv+RhT3LDkwEfKaAkgSmNQ=="],
|
||||
|
||||
"@types/ws": ["@types/ws@8.18.1", "", { "dependencies": { "@types/node": "*" } }, "sha512-ThVF6DCVhA8kUGy+aazFQ4kXQ7E1Ty7A3ypFOe0IcJV8O/M511G99AW24irKrW56Wt44yG9+ij8FaqoBGkuBXg=="],
|
||||
|
||||
"@vladfrangu/async_event_emitter": ["@vladfrangu/async_event_emitter@2.4.7", "", {}, "sha512-Xfe6rpCTxSxfbswi/W/Pz7zp1WWSNn4A0eW4mLkQUewCrXXtMj31lCg+iQyTkh/CkusZSq9eDflu7tjEDXUY6g=="],
|
||||
|
||||
"adm-zip": ["adm-zip@0.5.17", "", {}, "sha512-+Ut8d9LLqwEvHHJl1+PIHqoyDxFgVN847JTVM3Izi3xHDWPE4UtzzXysMZQs64DMcrJfBeS/uoEP4AD3HQHnQQ=="],
|
||||
|
||||
"agent-base": ["agent-base@6.0.2", "", { "dependencies": { "debug": "4" } }, "sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ=="],
|
||||
|
||||
"avr-vad": ["avr-vad@1.0.10", "", { "dependencies": { "onnxruntime-node": "^1.22.0-rev" } }, "sha512-gM8SiQIebujfKMfy5w74tRPH+Fg78CMrBoDkMhCN3TmYVmmD8fmuVag7Q7ZCBITpFvYkOZnWEdGWuCb3YukBJw=="],
|
||||
|
||||
"boolean": ["boolean@3.2.0", "", {}, "sha512-d0II/GO9uf9lfUHH2BQsjxzRJZBdsjgsBiW4BvhWk/3qoKwQFjIDVN19PfX8F2D/r9PCMTtLWjYVCFrpeYUzsw=="],
|
||||
|
||||
"buffer-from": ["buffer-from@1.1.2", "", {}, "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ=="],
|
||||
|
||||
"caseless": ["caseless@0.12.0", "", {}, "sha512-4tYFyifaFfGacoiObjJegolkwSU4xQNGbVgUiNYVUxbQ2x2lUsFvY4hVgVzGiIe6WLOPqycWXA40l+PWsxthUw=="],
|
||||
|
||||
"concat-stream": ["concat-stream@2.0.0", "", { "dependencies": { "buffer-from": "^1.0.0", "inherits": "^2.0.3", "readable-stream": "^3.0.2", "typedarray": "^0.0.6" } }, "sha512-MWufYdFw53ccGjCA+Ol7XJYpAlW6/prSMzuPOTRnJGcGzuhLn4Scrz7qf6o8bROZ514ltazcIFJZevcfbo0x7A=="],
|
||||
|
||||
"debug": ["debug@4.4.3", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA=="],
|
||||
|
||||
"define-data-property": ["define-data-property@1.1.4", "", { "dependencies": { "es-define-property": "^1.0.0", "es-errors": "^1.3.0", "gopd": "^1.0.1" } }, "sha512-rBMvIzlpA8v6E+SJZoo++HAYqsLrkg7MSfIinMPFhmkorw7X+dOXVJQs+QT69zGkzMyfDnIMN2Wid1+NbL3T+A=="],
|
||||
|
||||
"define-properties": ["define-properties@1.2.1", "", { "dependencies": { "define-data-property": "^1.0.1", "has-property-descriptors": "^1.0.0", "object-keys": "^1.1.1" } }, "sha512-8QmQKqEASLd5nx0U1B1okLElbUuuttJ/AnYmRXbbbGDWh6uS208EjD4Xqq/I9wK7u0v6O08XhTWnt5XtEbR6Dg=="],
|
||||
|
||||
"detect-node": ["detect-node@2.1.0", "", {}, "sha512-T0NIuQpnTvFDATNuHN5roPwSBG83rFsuO+MXXH9/3N1eFbn4wcPjttvjMLEPWJ0RGUYgQE7cGgS3tNxbqCGM7g=="],
|
||||
|
||||
"discord-api-types": ["discord-api-types@0.38.47", "", {}, "sha512-XgXQodHQBAE6kfD7kMvVo30863iHX1LHSqNq6MGUTDwIFCCvHva13+rwxyxVXDqudyApMNAd32PGjgVETi5rjA=="],
|
||||
|
||||
"discord.js": ["discord.js@14.26.3", "", { "dependencies": { "@discordjs/builders": "^1.14.1", "@discordjs/collection": "1.5.3", "@discordjs/formatters": "^0.6.2", "@discordjs/rest": "^2.6.1", "@discordjs/util": "^1.2.0", "@discordjs/ws": "^1.2.3", "@sapphire/snowflake": "3.5.3", "discord-api-types": "^0.38.40", "fast-deep-equal": "3.1.3", "lodash.snakecase": "4.1.1", "magic-bytes.js": "^1.13.0", "tslib": "^2.6.3", "undici": "6.24.1" } }, "sha512-XEKtYn28YFsiJ5l4fLRyikdbo6RD5oFyqfVHQlvXz2104JhH/E8slN28dbky05w3DCrJcNVWvhVvcJCTSl/KIg=="],
|
||||
|
||||
"dotenv": ["dotenv@17.4.2", "", {}, "sha512-nI4U3TottKAcAD9LLud4Cb7b2QztQMUEfHbvhTH09bqXTxnSie8WnjPALV/WMCrJZ6UV/qHJ6L03OqO3LcdYZw=="],
|
||||
|
||||
"env-paths": ["env-paths@2.2.1", "", {}, "sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A=="],
|
||||
|
||||
"es-define-property": ["es-define-property@1.0.1", "", {}, "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g=="],
|
||||
|
||||
"es-errors": ["es-errors@1.3.0", "", {}, "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw=="],
|
||||
|
||||
"es6-error": ["es6-error@4.1.1", "", {}, "sha512-Um/+FxMr9CISWh0bi5Zv0iOD+4cFh5qLeks1qhAopKVAJw3drgKbKySikp7wGhDL0HPeaja0P5ULZrxLkniUVg=="],
|
||||
|
||||
"escape-string-regexp": ["escape-string-regexp@4.0.0", "", {}, "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA=="],
|
||||
|
||||
"fast-deep-equal": ["fast-deep-equal@3.1.3", "", {}, "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q=="],
|
||||
|
||||
"ffmpeg-static": ["ffmpeg-static@5.3.0", "", { "dependencies": { "@derhuerst/http-basic": "^8.2.0", "env-paths": "^2.2.0", "https-proxy-agent": "^5.0.0", "progress": "^2.0.3" } }, "sha512-H+K6sW6TiIX6VGend0KQwthe+kaceeH/luE8dIZyOP35ik7ahYojDuqlTV1bOrtEwl01sy2HFNGQfi5IDJvotg=="],
|
||||
|
||||
"global-agent": ["global-agent@3.0.0", "", { "dependencies": { "boolean": "^3.0.1", "es6-error": "^4.1.1", "matcher": "^3.0.0", "roarr": "^2.15.3", "semver": "^7.3.2", "serialize-error": "^7.0.1" } }, "sha512-PT6XReJ+D07JvGoxQMkT6qji/jVNfX/h364XHZOWeRzy64sSFr+xJ5OX7LI3b4MPQzdL4H8Y8M0xzPpsVMwA8Q=="],
|
||||
|
||||
"globalthis": ["globalthis@1.0.4", "", { "dependencies": { "define-properties": "^1.2.1", "gopd": "^1.0.1" } }, "sha512-DpLKbNU4WylpxJykQujfCcwYWiV/Jhm50Goo0wrVILAv5jOr9d+H+UR3PhSCD2rCCEIg0uc+G+muBTwD54JhDQ=="],
|
||||
|
||||
"gopd": ["gopd@1.2.0", "", {}, "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg=="],
|
||||
|
||||
"has-property-descriptors": ["has-property-descriptors@1.0.2", "", { "dependencies": { "es-define-property": "^1.0.0" } }, "sha512-55JNKuIW+vq4Ke1BjOTjM2YctQIvCT7GFzHwmfZPGo5wnrgkid0YQtnAleFSqumZm4az3n2BS+erby5ipJdgrg=="],
|
||||
|
||||
"http-response-object": ["http-response-object@3.0.2", "", { "dependencies": { "@types/node": "^10.0.3" } }, "sha512-bqX0XTF6fnXSQcEJ2Iuyr75yVakyjIDCqroJQ/aHfSdlM743Cwqoi2nDYMzLGWUcuTWGWy8AAvOKXTfiv6q9RA=="],
|
||||
|
||||
"https-proxy-agent": ["https-proxy-agent@5.0.1", "", { "dependencies": { "agent-base": "6", "debug": "4" } }, "sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA=="],
|
||||
|
||||
"inherits": ["inherits@2.0.4", "", {}, "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ=="],
|
||||
|
||||
"json-stringify-safe": ["json-stringify-safe@5.0.1", "", {}, "sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA=="],
|
||||
|
||||
"lodash": ["lodash@4.18.1", "", {}, "sha512-dMInicTPVE8d1e5otfwmmjlxkZoUpiVLwyeTdUsi/Caj/gfzzblBcCE5sRHV/AsjuCmxWrte2TNGSYuCeCq+0Q=="],
|
||||
|
||||
"lodash.snakecase": ["lodash.snakecase@4.1.1", "", {}, "sha512-QZ1d4xoBHYUeuouhEq3lk3Uq7ldgyFXGBhg04+oRLnIz8o9T65Eh+8YdroUwn846zchkA9yDsDl5CVVaV2nqYw=="],
|
||||
|
||||
"magic-bytes.js": ["magic-bytes.js@1.13.0", "", {}, "sha512-afO2mnxW7GDTXMm5/AoN1WuOcdoKhtgXjIvHmobqTD1grNplhGdv3PFOyjCVmrnOZBIT/gD/koDKpYG+0mvHcg=="],
|
||||
|
||||
"matcher": ["matcher@3.0.0", "", { "dependencies": { "escape-string-regexp": "^4.0.0" } }, "sha512-OkeDaAZ/bQCxeFAozM55PKcKU0yJMPGifLwV4Qgjitu+5MoAfSQN4lsLJeXZ1b8w0x+/Emda6MZgXS1jvsapng=="],
|
||||
|
||||
"ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="],
|
||||
|
||||
"object-keys": ["object-keys@1.1.1", "", {}, "sha512-NuAESUOUMrlIXOfHKzD6bpPu3tYt3xvjNdRIQ+FeT0lNb4K8WR70CaDxhuNguS2XG+GjkyMwOzsN5ZktImfhLA=="],
|
||||
|
||||
"onnxruntime-common": ["onnxruntime-common@1.24.3", "", {}, "sha512-GeuPZO6U/LBJXvwdaqHbuUmoXiEdeCjWi/EG7Y1HNnDwJYuk6WUbNXpF6luSUY8yASul3cmUlLGrCCL1ZgVXqA=="],
|
||||
|
||||
"onnxruntime-node": ["onnxruntime-node@1.24.3", "", { "dependencies": { "adm-zip": "^0.5.16", "global-agent": "^3.0.0", "onnxruntime-common": "1.24.3" }, "os": [ "linux", "win32", "darwin", ] }, "sha512-JH7+czbc8ALA819vlTgcV+Q214/+VjGeBHDjX81+ZCD0PCVCIFGFNtT0V4sXG/1JXypKPgScQcB3ij/hk3YnTg=="],
|
||||
|
||||
"openai": ["openai@6.35.0", "", { "peerDependencies": { "ws": "^8.18.0", "zod": "^3.25 || ^4.0" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-L/skwIGnt5xQZHb0UfTu9uAUKbis3ehKypOuJKi20QvG7UStV6C8IC3myGYHcdiF4kms/bAvOJ9UqqNWqi8x/Q=="],
|
||||
|
||||
"opusscript": ["opusscript@0.1.1", "", {}, "sha512-mL0fZZOUnXdZ78woRXp18lApwpp0lF5tozJOD1Wut0dgrA9WuQTgSels/CSmFleaAZrJi/nci5KOVtbuxeWoQA=="],
|
||||
|
||||
"parse-cache-control": ["parse-cache-control@1.0.1", "", {}, "sha512-60zvsJReQPX5/QP0Kzfd/VrpjScIQ7SHBW6bFCYfEP+fp0Eppr1SHhIO5nd1PjZtvclzSzES9D/p5nFJurwfWg=="],
|
||||
|
||||
"prism-media": ["prism-media@1.3.5", "", { "peerDependencies": { "@discordjs/opus": ">=0.8.0 <1.0.0", "ffmpeg-static": "^5.0.2 || ^4.2.7 || ^3.0.0 || ^2.4.0", "node-opus": "^0.3.3", "opusscript": "^0.0.8" }, "optionalPeers": ["@discordjs/opus", "ffmpeg-static", "node-opus", "opusscript"] }, "sha512-IQdl0Q01m4LrkN1EGIE9lphov5Hy7WWlH6ulf5QdGePLlPas9p2mhgddTEHrlaXYjjFToM1/rWuwF37VF4taaA=="],
|
||||
|
||||
"progress": ["progress@2.0.3", "", {}, "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA=="],
|
||||
|
||||
"readable-stream": ["readable-stream@3.6.2", "", { "dependencies": { "inherits": "^2.0.3", "string_decoder": "^1.1.1", "util-deprecate": "^1.0.1" } }, "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA=="],
|
||||
|
||||
"roarr": ["roarr@2.15.4", "", { "dependencies": { "boolean": "^3.0.1", "detect-node": "^2.0.4", "globalthis": "^1.0.1", "json-stringify-safe": "^5.0.1", "semver-compare": "^1.0.0", "sprintf-js": "^1.1.2" } }, "sha512-CHhPh+UNHD2GTXNYhPWLnU8ONHdI+5DI+4EYIAOaiD63rHeYlZvyh8P+in5999TTSFgUYuKUAjzRI4mdh/p+2A=="],
|
||||
|
||||
"safe-buffer": ["safe-buffer@5.2.1", "", {}, "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ=="],
|
||||
|
||||
"semver": ["semver@7.7.4", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA=="],
|
||||
|
||||
"semver-compare": ["semver-compare@1.0.0", "", {}, "sha512-YM3/ITh2MJ5MtzaM429anh+x2jiLVjqILF4m4oyQB18W7Ggea7BfqdH/wGMK7dDiMghv/6WG7znWMwUDzJiXow=="],
|
||||
|
||||
"serialize-error": ["serialize-error@7.0.1", "", { "dependencies": { "type-fest": "^0.13.1" } }, "sha512-8I8TjW5KMOKsZQTvoxjuSIa7foAwPWGOts+6o7sgjz41/qMD9VQHEDxi6PBvK2l0MXUmqZyNpUK+T2tQaaElvw=="],
|
||||
|
||||
"sprintf-js": ["sprintf-js@1.1.3", "", {}, "sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA=="],
|
||||
|
||||
"string_decoder": ["string_decoder@1.3.0", "", { "dependencies": { "safe-buffer": "~5.2.0" } }, "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA=="],
|
||||
|
||||
"ts-mixer": ["ts-mixer@6.0.4", "", {}, "sha512-ufKpbmrugz5Aou4wcr5Wc1UUFWOLhq+Fm6qa6P0w0K5Qw2yhaUoiWszhCVuNQyNwrlGiscHOmqYoAox1PtvgjA=="],
|
||||
|
||||
"tslib": ["tslib@2.8.1", "", {}, "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w=="],
|
||||
|
||||
"type-fest": ["type-fest@0.13.1", "", {}, "sha512-34R7HTnG0XIJcBSn5XhDd7nNFPRcXYRZrBB2O2jdKqYODldSzBAqzsWoZYYvduky73toYS/ESqxPvkDf/F0XMg=="],
|
||||
|
||||
"typedarray": ["typedarray@0.0.6", "", {}, "sha512-/aCDEGatGvZ2BIk+HmLf4ifCJFwvKFNb9/JeZPMulfgFracn9QFcAf5GO8B/mweUjSoblS5In0cWhqpfs/5PQA=="],
|
||||
|
||||
"typescript": ["typescript@6.0.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-y2TvuxSZPDyQakkFRPZHKFm+KKVqIisdg9/CZwm9ftvKXLP8NRWj38/ODjNbr43SsoXqNuAisEf1GdCxqWcdBw=="],
|
||||
|
||||
"undici": ["undici@6.24.1", "", {}, "sha512-sC+b0tB1whOCzbtlx20fx3WgCXwkW627p4EA9uM+/tNNPkSS+eSEld6pAs9nDv7WbY1UUljBMYPtu9BCOrCWKA=="],
|
||||
|
||||
"undici-types": ["undici-types@7.19.2", "", {}, "sha512-qYVnV5OEm2AW8cJMCpdV20CDyaN3g0AjDlOGf1OW4iaDEx8MwdtChUp4zu4H0VP3nDRF/8RKWH+IPp9uW0YGZg=="],
|
||||
|
||||
"util-deprecate": ["util-deprecate@1.0.2", "", {}, "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw=="],
|
||||
|
||||
"ws": ["ws@8.20.0", "", { "peerDependencies": { "bufferutil": "^4.0.1", "utf-8-validate": ">=5.0.2" }, "optionalPeers": ["bufferutil", "utf-8-validate"] }, "sha512-sAt8BhgNbzCtgGbt2OxmpuryO63ZoDk/sqaB/znQm94T4fCEsy/yV+7CdC1kJhOU9lboAEU7R3kquuycDoibVA=="],
|
||||
|
||||
"zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="],
|
||||
|
||||
"@discordjs/rest/@discordjs/collection": ["@discordjs/collection@2.1.1", "", {}, "sha512-LiSusze9Tc7qF03sLCujF5iZp7K+vRNEDBZ86FT9aQAv3vxMLihUvKvpsCWiQ2DJq1tVckopKm1rxomgNUc9hg=="],
|
||||
|
||||
"@discordjs/rest/@sapphire/snowflake": ["@sapphire/snowflake@3.5.5", "", {}, "sha512-xzvBr1Q1c4lCe7i6sRnrofxeO1QTP/LKQ6A6qy0iB4x5yfiSfARMEQEghojzTNALDTcv8En04qYNIco9/K9eZQ=="],
|
||||
|
||||
"@discordjs/ws/@discordjs/collection": ["@discordjs/collection@2.1.1", "", {}, "sha512-LiSusze9Tc7qF03sLCujF5iZp7K+vRNEDBZ86FT9aQAv3vxMLihUvKvpsCWiQ2DJq1tVckopKm1rxomgNUc9hg=="],
|
||||
|
||||
"http-response-object/@types/node": ["@types/node@10.17.60", "", {}, "sha512-F0KIgDJfy2nA3zMLmWGKxcH2ZVEtCZXHHdOQs2gSaQ27+lNeEfGxzkIw90aXswATX7AZ33tahPbzy6KAfUreVw=="],
|
||||
}
|
||||
}
|
||||
35
package.json
Normal file
35
package.json
Normal file
@@ -0,0 +1,35 @@
|
||||
{
|
||||
"name": "realtime_voice_bot",
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
"dev": "bun --watch src/index.ts",
|
||||
"start": "bun src/index.ts",
|
||||
"check": "tsc --noEmit",
|
||||
"build": "tsc -p tsconfig.json"
|
||||
},
|
||||
"engines": {
|
||||
"bun": ">=1.3.0",
|
||||
"node": ">=22.12.0"
|
||||
},
|
||||
"dependencies": {
|
||||
"@discordjs/voice": "^0.19.2",
|
||||
"avr-vad": "^1.0.10",
|
||||
"discord.js": "^14.26.3",
|
||||
"dotenv": "^17.4.2",
|
||||
"ffmpeg-static": "^5.3.0",
|
||||
"openai": "^6.35.0",
|
||||
"opusscript": "^0.1.1",
|
||||
"prism-media": "^1.3.5",
|
||||
"ws": "^8.20.0",
|
||||
"zod": "^4.3.6"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^25.6.0",
|
||||
"typescript": "^6.0.3"
|
||||
},
|
||||
"trustedDependencies": [
|
||||
"onnxruntime-node"
|
||||
]
|
||||
}
|
||||
452
src/audio/guild-voice-session.ts
Normal file
452
src/audio/guild-voice-session.ts
Normal file
@@ -0,0 +1,452 @@
|
||||
import { EventEmitter } from "node:events";
|
||||
|
||||
import prism from "prism-media";
|
||||
import { RealTimeVAD } from "avr-vad";
|
||||
import {
|
||||
AudioPlayerStatus,
|
||||
EndBehaviorType,
|
||||
NoSubscriberBehavior,
|
||||
VoiceConnectionStatus,
|
||||
createAudioPlayer,
|
||||
entersState,
|
||||
joinVoiceChannel,
|
||||
type AudioPlayer,
|
||||
type AudioReceiveStream,
|
||||
type VoiceConnection,
|
||||
} from "@discordjs/voice";
|
||||
import type { Client, Guild, VoiceBasedChannel } from "discord.js";
|
||||
|
||||
import type { AppConfig } from "../config.js";
|
||||
import { Logger } from "../logger.js";
|
||||
import { float32ToPcm16Buffer, int16ArrayToFloat32, Stereo48kToMono16kDownsampler, takeFrame } from "./pcm.js";
|
||||
import { ConversationMemory, type UserUtterance } from "../services/conversation.js";
|
||||
import { ElevenLabsSttService } from "../services/elevenlabs-stt.js";
|
||||
import { ElevenLabsTtsService, type PreparedSpeechPlayback } from "../services/elevenlabs-tts.js";
|
||||
import { OpenAiLlmService } from "../services/openai-llm.js";
|
||||
|
||||
/**
 * Everything a {@link GuildVoiceSession} needs at construction time:
 * the Discord handles identifying where to connect, the shared application
 * plumbing, and the three speech/LLM service clients.
 */
interface GuildVoiceSessionOptions {
  // --- Discord context -----------------------------------------------------

  /** Discord client; its `user.id` is used to ignore the bot's own speaking events. */
  client: Client;

  /** Guild that owns the session; supplies the guild id and the voice adapter creator. */
  guild: Guild;

  /** Voice channel the session joins. */
  voiceChannel: VoiceBasedChannel;

  /**
   * Optional id of an associated text channel.
   * NOTE(review): presumably the channel the command was issued from, used
   * for debug text events — confirm against the caller.
   */
  textChannelId?: string;

  // --- Application plumbing ------------------------------------------------

  /** Application configuration (e.g. `MAX_CONVERSATION_TURNS` sizes the conversation memory). */
  config: AppConfig;

  /** Logger used for session diagnostics. */
  logger: Logger;

  // --- Service clients -----------------------------------------------------

  /** ElevenLabs speech-to-text client. */
  stt: ElevenLabsSttService;

  /** ElevenLabs text-to-speech client. */
  tts: ElevenLabsTtsService;

  /** OpenAI LLM client. */
  llm: OpenAiLlmService;
}
|
||||
|
||||
/**
 * One pending item in the session's speech playback queue.
 */
interface SpeechJob {
  /** Text to be spoken. */
  text: string;

  /**
   * Origin of the job.
   * NOTE(review): presumably "assistant" marks an LLM-generated reply and
   * "manual" marks text supplied directly by a user command — confirm
   * against the code that enqueues jobs.
   */
  source: "assistant" | "manual";
}
|
||||
|
||||
/**
 * Per-speaker receive pipeline.
 *
 * PCM chunks emitted by `decoder` are downsampled with
 * `Stereo48kToMono16kDownsampler`, buffered, cut into fixed-size frames and
 * fed sequentially to a `RealTimeVAD` instance. The VAD's speech-start and
 * speech-end callbacks are forwarded to the callbacks supplied to
 * {@link UserAudioSession.create}.
 *
 * Instances are built through {@link UserAudioSession.create} because the VAD
 * has to be constructed asynchronously.
 */
class UserAudioSession {
  /**
   * Number of samples handed to the VAD per frame. The same value is used for
   * the VAD's `frameSamples` option and for slicing the pending buffer, so the
   * two can never drift apart.
   */
  private static readonly FRAME_SAMPLES = 1536;

  private readonly downsampler = new Stereo48kToMono16kDownsampler();

  /** Downsampled samples that have not yet filled a whole VAD frame. */
  private readonly pendingSamples: number[] = [];

  private readonly vad: RealTimeVAD;

  /** Tail of the promise chain that serialises VAD frame processing. */
  private processing = Promise.resolve();

  /** Set once `destroy()` has run; later frames are dropped. */
  private destroyed = false;

  private constructor(
    private readonly logger: Logger,
    private readonly speakerId: string,
    private readonly speakerName: string,
    private readonly receiveStream: AudioReceiveStream,
    private readonly decoder: NodeJS.ReadWriteStream & { destroy: () => void },
    vad: RealTimeVAD,
    // Stored for parity with the factory arguments; the VAD callback created
    // in `create()` closes over `options.onSpeechEnd` directly.
    private readonly onSpeechEnd: (utterance: UserUtterance, audio: Float32Array) => void,
  ) {
    this.vad = vad;
  }

  /**
   * Builds the VAD, wires the decoder/receive-stream listeners and returns the
   * ready session.
   *
   * @param options.logger        Logger for stream and VAD warnings.
   * @param options.speakerId     Id of the speaker this session belongs to.
   * @param options.speakerName   Display name attached to emitted utterances.
   * @param options.receiveStream Receive stream for the speaker; destroyed by `destroy()`.
   * @param options.decoder       Stream whose `data` events carry the PCM chunks to analyse.
   * @param options.onSpeechStart Invoked when the VAD reports a speech start.
   * @param options.onSpeechEnd   Invoked when the VAD reports a speech end, with
   *                              the utterance metadata (`text` is always `""`
   *                              here) and the audio provided by the VAD.
   * @returns The initialised session.
   */
  static async create(options: {
    logger: Logger;
    speakerId: string;
    speakerName: string;
    receiveStream: AudioReceiveStream;
    decoder: NodeJS.ReadWriteStream & { destroy: () => void };
    onSpeechStart: () => void;
    onSpeechEnd: (utterance: UserUtterance, audio: Float32Array) => void;
  }): Promise<UserAudioSession> {
    const vadInstance = await RealTimeVAD.new({
      model: "v5",
      sampleRate: 16000,
      frameSamples: UserAudioSession.FRAME_SAMPLES,
      positiveSpeechThreshold: 0.55,
      negativeSpeechThreshold: 0.35,
      redemptionFrames: 8,
      preSpeechPadFrames: 2,
      minSpeechFrames: 3,
      onFrameProcessed: () => undefined,
      onVADMisfire: () => undefined,
      onSpeechStart: () => {
        options.onSpeechStart();
      },
      onSpeechRealStart: () => undefined,
      onSpeechEnd: (audio: Float32Array) => {
        options.onSpeechEnd(
          {
            speakerId: options.speakerId,
            speakerName: options.speakerName,
            // No transcript exists at VAD time; the utterance is emitted
            // with an empty text.
            text: "",
          },
          audio,
        );
      },
    });

    const session = new UserAudioSession(
      options.logger,
      options.speakerId,
      options.speakerName,
      options.receiveStream,
      options.decoder,
      vadInstance,
      options.onSpeechEnd,
    );

    session.decoder.on("data", (chunk: Buffer) => {
      session.pushPcmChunk(chunk);
    });

    session.decoder.on("error", (error) => {
      options.logger.warn("PCM decoder error", options.speakerId, error);
    });

    session.receiveStream.on("error", (error) => {
      options.logger.warn("Audio receive stream error", options.speakerId, error);
    });

    return session;
  }

  /**
   * Downsamples one decoded PCM chunk, appends it to the pending buffer and
   * queues every complete frame for VAD processing. Chunks arriving after
   * `destroy()` are ignored.
   */
  private pushPcmChunk(chunk: Buffer): void {
    if (this.destroyed) {
      return;
    }

    const mono16k = this.downsampler.pushStereo48kChunk(chunk);
    if (mono16k.length === 0) {
      return;
    }

    for (const sample of mono16k) {
      this.pendingSamples.push(sample);
    }

    for (
      let frame = takeFrame(this.pendingSamples, UserAudioSession.FRAME_SAMPLES);
      frame;
      frame = takeFrame(this.pendingSamples, UserAudioSession.FRAME_SAMPLES)
    ) {
      const floatFrame = int16ArrayToFloat32(frame);

      // Frames are chained so the VAD sees them strictly in order. The catch
      // keeps the chain alive after a failed frame.
      this.processing = this.processing
        .then(async () => {
          // Frames that were queued before destroy() but have not started yet
          // must not reach a VAD that is being torn down.
          if (!this.destroyed) {
            await this.vad.processAudio(floatFrame);
          }
        })
        .catch((error) => {
          this.logger.warn("VAD frame processing failed", this.speakerId, this.speakerName, error);
        });
    }
  }

  /**
   * Tears the pipeline down: destroys both streams immediately, drops any
   * queued frames and disposes the VAD once the frame currently in flight (if
   * any) has settled. Safe to call more than once.
   */
  destroy(): void {
    if (this.destroyed) {
      return;
    }
    this.destroyed = true;

    this.receiveStream.destroy();
    this.decoder.destroy();

    // `processing` never rejects (each link has its own catch), so the VAD is
    // destroyed right after the in-flight frame finishes instead of racing it.
    void this.processing
      .then(() => this.vad.destroy())
      .catch((error) => {
        this.logger.warn("VAD destroy failed", this.speakerId, this.speakerName, error);
      });
  }
}
|
||||
|
||||
/**
 * Voice-assistant session bound to a single guild voice channel.
 *
 * It joins the channel, creates one `UserAudioSession` per speaking user,
 * turns finished utterances into text (STT), asks the LLM for a reply and
 * plays the reply through a single, interruptible playback queue.
 */
export class GuildVoiceSession extends EventEmitter {
  /**
   * Utterances with fewer samples than this are ignored.
   * NOTE(review): presumably 0.25 s at 16 kHz — confirm against the VAD output rate.
   */
  private static readonly MIN_UTTERANCE_SAMPLES = 16000 * 0.25;

  readonly guildId: string;
  readonly voiceChannelId: string;

  private readonly connection: VoiceConnection;
  private readonly player: AudioPlayer;
  private readonly memory: ConversationMemory;
  private readonly trackedUsers = new Map<string, UserAudioSession>();
  private readonly pendingUsers = new Map<string, Promise<void>>();
  private readonly queue: SpeechJob[] = [];

  /** True while `drainQueue` is running; guarantees a single consumer. */
  private draining = false;
  /** Set by `destroy()`; late async work checks it before touching the session. */
  private destroyed = false;
  private currentAbortController: AbortController | null = null;
  private currentPlayback: PreparedSpeechPlayback | null = null;
  private textChannelId?: string;

  private constructor(private readonly options: GuildVoiceSessionOptions) {
    super();

    this.guildId = options.guild.id;
    this.voiceChannelId = options.voiceChannel.id;
    this.textChannelId = options.textChannelId;
    this.memory = new ConversationMemory(options.config.MAX_CONVERSATION_TURNS);
    this.player = createAudioPlayer({
      behaviors: {
        noSubscriber: NoSubscriberBehavior.Pause,
      },
    });
    this.connection = joinVoiceChannel({
      guildId: options.guild.id,
      channelId: options.voiceChannel.id,
      adapterCreator: options.guild.voiceAdapterCreator,
      selfDeaf: false,
      selfMute: false,
    });
  }

  /**
   * Joins the voice channel and resolves once the connection is ready.
   *
   * @throws when the connection does not become ready in time; the
   *         half-open connection is destroyed before the error propagates.
   */
  static async create(options: GuildVoiceSessionOptions): Promise<GuildVoiceSession> {
    const session = new GuildVoiceSession(options);
    try {
      await session.initialize();
    } catch (error) {
      // Do not leave the bot stuck in the channel when setup fails.
      await session.destroy();
      throw error;
    }
    return session;
  }

  /** Wires player/connection listeners and waits for the connection to be ready. */
  private async initialize(): Promise<void> {
    this.player.on("error", (error) => {
      this.options.logger.warn("Audio player error", this.guildId, error);
    });

    this.connection.on("stateChange", (_oldState, newState) => {
      if (newState.status === VoiceConnectionStatus.Destroyed) {
        this.options.logger.info("Voice connection destroyed", this.guildId);
      }
    });

    this.connection.subscribe(this.player);
    await entersState(this.connection, VoiceConnectionStatus.Ready, 30_000);

    this.connection.receiver.speaking.on("start", (userId: string) => {
      if (userId === this.options.client.user?.id) {
        return;
      }

      // Fire-and-forget, but never let a setup failure become an unhandled rejection.
      void this.ensureTrackedUser(userId).catch((error: unknown) => {
        this.options.logger.warn("Failed to track speaker", this.guildId, userId, error);
      });
    });
  }

  /** Changes (or clears) the text channel used for status/debug messages. */
  setTextChannel(textChannelId?: string): void {
    this.textChannelId = textChannelId;
  }

  /** Forgets the conversation history and stops any current or queued speech. */
  clearConversation(): void {
    this.memory.clear();
    this.interruptPlayback("conversation-reset");
  }

  /** Returns a multi-line, human-readable status report. */
  statusSummary(): string {
    const playbackState = this.player.state.status;
    return [
      `세션 활성: 예`,
      `음성 채널: ${this.options.voiceChannel.name}`,
      `추적 유저 수: ${this.trackedUsers.size}`,
      `재생 상태: ${playbackState}`,
      `대기열: ${this.queue.length}`,
      `최근 대화 턴: ${this.memory.recentTurns().length}`,
    ].join("\n");
  }

  /**
   * Queues `text` for speech. Resolves when the queue has drained if this call
   * started the drain, or immediately if a drain is already running.
   */
  async speakText(text: string): Promise<void> {
    this.queue.push({
      text,
      source: "manual",
    });
    await this.drainQueue();
  }

  /** Drops all queued speech and stops the playback currently in progress. */
  interruptPlayback(reason: string): void {
    if (this.queue.length > 0 || this.player.state.status !== AudioPlayerStatus.Idle) {
      this.options.logger.info("Interrupting playback", this.guildId, reason);
    }

    this.queue.splice(0, this.queue.length);
    this.currentAbortController?.abort();
    this.currentAbortController = null;
    this.currentPlayback?.dispose();
    this.currentPlayback = null;
    this.player.stop(true);
  }

  /** Stops playback, releases every per-user pipeline and leaves the channel. Idempotent. */
  async destroy(): Promise<void> {
    if (this.destroyed) {
      return;
    }
    this.destroyed = true;

    this.interruptPlayback("session-destroy");
    for (const session of this.trackedUsers.values()) {
      session.destroy();
    }
    this.trackedUsers.clear();
    this.pendingUsers.clear();

    // destroy() throws on an already-destroyed connection (e.g. after a kick).
    if (this.connection.state.status !== VoiceConnectionStatus.Destroyed) {
      this.connection.destroy();
    }
  }

  /** Ensures exactly one audio pipeline exists for `userId`, de-duplicating concurrent calls. */
  private async ensureTrackedUser(userId: string): Promise<void> {
    if (this.destroyed || this.trackedUsers.has(userId)) {
      return;
    }

    const existing = this.pendingUsers.get(userId);
    if (existing) {
      await existing;
      return;
    }

    const pending = this.createTrackedUser(userId).finally(() => {
      this.pendingUsers.delete(userId);
    });
    this.pendingUsers.set(userId, pending);
    await pending;
  }

  /** Subscribes to a user's audio and builds the decode + VAD pipeline for it. */
  private async createTrackedUser(userId: string): Promise<void> {
    const speakerName = await this.resolveSpeakerName(userId);
    if (this.destroyed) {
      return;
    }

    const receiveStream = this.connection.receiver.subscribe(userId, {
      end: {
        behavior: EndBehaviorType.Manual,
      },
    });

    const decoder = new prism.opus.Decoder({
      rate: 48000,
      channels: 2,
      frameSize: 960,
    }) as NodeJS.ReadWriteStream & { destroy: () => void };

    receiveStream.pipe(decoder);

    let session: UserAudioSession;
    try {
      session = await UserAudioSession.create({
        logger: this.options.logger,
        speakerId: userId,
        speakerName,
        receiveStream,
        decoder,
        onSpeechStart: () => {
          this.interruptPlayback(`barge-in:${speakerName}`);
        },
        onSpeechEnd: (utterance, audio) => {
          void this.handleSpeechEnd(utterance, audio).catch((error: unknown) => {
            this.options.logger.warn("Utterance handling failed", this.guildId, userId, error);
          });
        },
      });
    } catch (error) {
      // Release the subscription and decoder that would otherwise leak.
      receiveStream.destroy();
      decoder.destroy();
      throw error;
    }

    if (this.destroyed) {
      // The session was torn down while the pipeline was being created.
      session.destroy();
      return;
    }

    this.trackedUsers.set(userId, session);
    this.options.logger.info("Tracking speaker", this.guildId, userId, speakerName);
  }

  /** Resolves a display name for `userId`, falling back to a short id-based label. */
  private async resolveSpeakerName(userId: string): Promise<string> {
    try {
      const user = await this.options.client.users.fetch(userId);
      return user.globalName ?? user.username;
    } catch {
      return `user-${userId.slice(-6)}`;
    }
  }

  /** Runs one finished utterance through STT -> memory -> LLM -> speech queue. */
  private async handleSpeechEnd(utterance: UserUtterance, audio: Float32Array): Promise<void> {
    if (this.destroyed || audio.length < GuildVoiceSession.MIN_UTTERANCE_SAMPLES) {
      return;
    }

    const pcmBuffer = float32ToPcm16Buffer(audio);
    let transcript: string | null = null;

    try {
      transcript = await this.options.stt.transcribePcm16(pcmBuffer);
    } catch (error) {
      this.options.logger.warn("STT failed", this.guildId, utterance.speakerId, error);
      await this.announce(`음성 인식 실패: ${utterance.speakerName}`);
      return;
    }

    if (!transcript || transcript.trim().length === 0) {
      return;
    }

    const hydratedUtterance: UserUtterance = {
      ...utterance,
      text: transcript.trim(),
    };

    this.options.logger.info("Transcript committed", this.guildId, hydratedUtterance.speakerName, hydratedUtterance.text);
    this.memory.addUserTurn(hydratedUtterance);

    if (this.options.config.DEBUG_TEXT_EVENTS) {
      await this.announce(`🗣️ ${hydratedUtterance.speakerName}: ${hydratedUtterance.text}`);
    }

    let reply: string;
    try {
      reply = await this.options.llm.generateReply(this.memory, hydratedUtterance);
    } catch (error) {
      this.options.logger.warn("LLM failed", this.guildId, utterance.speakerId, error);
      reply = "지금은 답변 생성에 실패했습니다. 잠시 후 다시 말씀해 주세요.";
    }

    this.memory.addAssistantTurn(reply);
    if (this.options.config.DEBUG_TEXT_EVENTS) {
      await this.announce(`🤖 ${reply}`);
    }

    if (this.destroyed) {
      // The session ended while STT/LLM were running; nothing left to play on.
      return;
    }

    this.queue.push({
      text: reply,
      source: "assistant",
    });
    await this.drainQueue();
  }

  /** Plays queued jobs one at a time; concurrent callers return immediately. */
  private async drainQueue(): Promise<void> {
    if (this.draining) {
      return;
    }

    this.draining = true;
    try {
      for (let job = this.queue.shift(); job !== undefined; job = this.queue.shift()) {
        await this.playJob(job);
      }
    } finally {
      this.draining = false;
    }
  }

  /** Synthesizes and plays a single job, honoring interruption at every await. */
  private async playJob(job: SpeechJob): Promise<void> {
    const abortController = new AbortController();
    this.currentAbortController = abortController;

    let playback: PreparedSpeechPlayback | null = null;
    let installed = false;

    try {
      try {
        playback = await this.options.tts.preparePlayback(job.text, abortController.signal);
      } catch (error) {
        if (!abortController.signal.aborted) {
          this.options.logger.warn("TTS synthesis failed", this.guildId, job.source, error);
          await this.announce("음성 출력 생성에 실패했습니다.");
        }
        return;
      }

      if (abortController.signal.aborted) {
        // Interrupted between the response arriving and playback starting.
        return;
      }

      this.currentPlayback = playback;
      installed = true;

      try {
        this.player.play(playback.resource);

        // Abort-aware so a barge-in while buffering does not stall the queue for 20 s.
        await this.waitForPlayerStatus(AudioPlayerStatus.Playing, 20_000, abortController.signal).catch(() => null);
        await entersState(this.player, AudioPlayerStatus.Idle, 300_000);
      } catch (error) {
        if (!abortController.signal.aborted) {
          this.options.logger.warn("Audio playback failed", this.guildId, error);
        }
      }
    } finally {
      if (playback !== null) {
        if (!installed) {
          playback.dispose();
        } else if (this.currentPlayback === playback) {
          // Otherwise interruptPlayback() already disposed and cleared it.
          playback.dispose();
          this.currentPlayback = null;
        }
      }
      if (this.currentAbortController === abortController) {
        this.currentAbortController = null;
      }
    }
  }

  /**
   * Waits until the player reaches `status`, rejecting after `timeoutMs` or as
   * soon as `signal` aborts, whichever comes first.
   */
  private async waitForPlayerStatus(status: AudioPlayerStatus, timeoutMs: number, signal: AbortSignal): Promise<void> {
    const gate = new AbortController();
    const cancel = (): void => {
      gate.abort();
    };
    const timer = setTimeout(cancel, timeoutMs);
    signal.addEventListener("abort", cancel, { once: true });
    if (signal.aborted) {
      cancel();
    }

    try {
      await entersState(this.player, status, gate.signal);
    } finally {
      clearTimeout(timer);
      signal.removeEventListener("abort", cancel);
    }
  }

  /** Best-effort message to the linked text channel; failures are ignored. */
  private async announce(message: string): Promise<void> {
    if (!this.textChannelId) {
      return;
    }

    const channel = await this.options.client.channels.fetch(this.textChannelId).catch(() => null);
    if (!channel?.isTextBased() || !("send" in channel) || typeof channel.send !== "function") {
      return;
    }

    await channel.send(message).catch(() => null);
  }
}
|
||||
60
src/audio/pcm.ts
Normal file
60
src/audio/pcm.ts
Normal file
@@ -0,0 +1,60 @@
|
||||
/**
 * Streaming converter from interleaved 16-bit little-endian stereo PCM to mono
 * at one third of the input sample rate (48 kHz -> 16 kHz).
 *
 * Each stereo frame is averaged to one mono sample, and every three mono
 * samples are averaged to one output sample. State is kept between calls, so
 * chunks may end in the middle of a stereo frame or in the middle of a group
 * of three without losing or misaligning samples.
 */
export class Stereo48kToMono16kDownsampler {
  /** Bytes of an incomplete stereo frame left over from the previous chunk. */
  private carry: Buffer = Buffer.alloc(0);

  /** Sum of the mono samples collected for the output sample in progress. */
  private groupSum = 0;

  /** How many mono samples (0-2) currently contribute to `groupSum`. */
  private groupCount = 0;

  /**
   * Consumes one PCM chunk and returns the output samples it completes.
   *
   * @param chunk interleaved s16le stereo PCM; any byte length is accepted
   * @returns the newly completed mono samples (possibly empty)
   */
  pushStereo48kChunk(chunk: Buffer): Int16Array {
    const data = this.carry.length > 0 ? Buffer.concat([this.carry, chunk]) : chunk;

    // Only whole 4-byte stereo frames are consumed; the remainder is kept for
    // the next call so later samples stay byte-aligned.
    const usableBytes = data.length - (data.length % 4);
    this.carry = usableBytes < data.length ? Buffer.from(data.subarray(usableBytes)) : Buffer.alloc(0);

    const frameCount = usableBytes / 4;
    const output = new Int16Array(Math.floor((this.groupCount + frameCount) / 3));
    let writeIndex = 0;

    for (let offset = 0; offset < usableBytes; offset += 4) {
      const left = data.readInt16LE(offset);
      const right = data.readInt16LE(offset + 2);
      this.groupSum += Math.round((left + right) / 2);
      this.groupCount += 1;

      if (this.groupCount === 3) {
        output[writeIndex] = Math.round(this.groupSum / 3);
        writeIndex += 1;
        this.groupSum = 0;
        this.groupCount = 0;
      }
    }

    return output;
  }
}
|
||||
|
||||
/**
 * Converts signed 16-bit samples to floats by dividing each one by 32768,
 * giving values in [-1, 1).
 */
export function int16ArrayToFloat32(input: Int16Array): Float32Array {
  return Float32Array.from(input, (sample) => sample / 32768);
}
|
||||
|
||||
/**
 * Encodes float samples as 16-bit little-endian PCM.
 *
 * Samples are clamped to [-1, 1]; negative values scale by 32768 and
 * non-negative values by 32767 so both ends of the int16 range are reachable.
 */
export function float32ToPcm16Buffer(input: Float32Array): Buffer {
  const pcm = Buffer.allocUnsafe(input.length * 2);

  input.forEach((sample, position) => {
    const clamped = Math.min(1, Math.max(-1, sample));
    const factor = clamped < 0 ? 32768 : 32767;
    pcm.writeInt16LE(Math.round(clamped * factor), position * 2);
  });

  return pcm;
}
|
||||
|
||||
/**
 * Removes the first `frameSize` values from `source` and returns them as an
 * Int16Array, or returns null (leaving `source` untouched) when fewer than
 * `frameSize` values are available.
 */
export function takeFrame(source: number[], frameSize: number): Int16Array | null {
  return source.length >= frameSize ? Int16Array.from(source.splice(0, frameSize)) : null;
}
|
||||
29
src/config.ts
Normal file
29
src/config.ts
Normal file
@@ -0,0 +1,29 @@
|
||||
import { config as loadDotenv } from "dotenv";
|
||||
import { z } from "zod";
|
||||
|
||||
loadDotenv();
|
||||
|
||||
/**
 * Schema for the environment variables the bot reads.
 *
 * Required secrets must be non-empty; tunables fall back to defaults when the
 * variable is absent.
 */
const envSchema = z.object({
  DISCORD_BOT_TOKEN: z.string().min(1),
  DISCORD_APPLICATION_ID: z.string().min(1),
  // A blank value ("DISCORD_COMMAND_GUILD_ID=" in a .env file) means "not set"
  // rather than a validation error, since this variable is optional.
  DISCORD_COMMAND_GUILD_ID: z
    .string()
    .optional()
    .transform((value) => (value === undefined || value.trim() === "" ? undefined : value)),
  OPENAI_API_KEY: z.string().min(1),
  OPENAI_MODEL: z.string().min(1).default("gpt-5.4-mini"),
  ELEVENLABS_API_KEY: z.string().min(1),
  ELEVENLABS_VOICE_ID: z.string().min(1),
  ELEVENLABS_STT_MODEL: z.string().min(1).default("scribe_v2_realtime"),
  ELEVENLABS_TTS_MODEL: z.string().min(1).default("eleven_flash_v2_5"),
  BOT_DEFAULT_LANGUAGE: z.string().min(2).default("ko"),
  MAX_CONVERSATION_TURNS: z.coerce.number().int().min(4).max(30).default(12),
  // Only the exact string "true" enables debug text events.
  DEBUG_TEXT_EVENTS: z
    .string()
    .optional()
    .transform((value) => value === "true"),
  LOG_LEVEL: z.enum(["debug", "info", "warn", "error"]).default("info"),
});
|
||||
|
||||
/** Shape of the validated configuration, derived from `envSchema`. */
export type AppConfig = z.infer<typeof envSchema>;

/**
 * Parses `process.env` against `envSchema` and returns the result.
 * Validation failures propagate as whatever `envSchema.parse` throws.
 */
export function loadConfig(): AppConfig {
  const parsed = envSchema.parse(process.env);
  return parsed;
}
|
||||
240
src/index.ts
Normal file
240
src/index.ts
Normal file
@@ -0,0 +1,240 @@
|
||||
import process from "node:process";
|
||||
|
||||
import {
|
||||
GatewayIntentBits,
|
||||
REST,
|
||||
Routes,
|
||||
SlashCommandBuilder,
|
||||
type ChatInputCommandInteraction,
|
||||
type Client,
|
||||
type GuildMember,
|
||||
type VoiceBasedChannel,
|
||||
} from "discord.js";
|
||||
import { Client as DiscordClient } from "discord.js";
|
||||
|
||||
import { GuildVoiceSession } from "./audio/guild-voice-session.js";
|
||||
import { loadConfig } from "./config.js";
|
||||
import { Logger } from "./logger.js";
|
||||
import { ElevenLabsSttService } from "./services/elevenlabs-stt.js";
|
||||
import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js";
|
||||
import { OpenAiLlmService } from "./services/openai-llm.js";
|
||||
|
||||
// Process-wide singletons: configuration, logger, Discord client and services.
const config = loadConfig();
const logger = new Logger(config.LOG_LEVEL);

// Slash commands that take no options, as [name, description] pairs.
const optionlessCommands: ReadonlyArray<readonly [string, string]> = [
  ["join", "현재 들어가 있는 음성 채널에 봇을 입장시킵니다."],
  ["leave", "현재 음성 세션을 종료합니다."],
  ["status", "현재 음성 세션 상태를 확인합니다."],
  ["reset", "대화 문맥과 재생 큐를 초기화합니다."],
];

// `/say <text>`: reads the given sentence aloud (at most 400 characters).
const sayCommand = new SlashCommandBuilder()
  .setName("say")
  .setDescription("텍스트를 바로 음성으로 읽습니다.")
  .addStringOption((option) =>
    option.setName("text").setDescription("읽을 문장").setRequired(true).setMaxLength(400),
  );

// JSON payloads sent to Discord when registering the commands.
const commands = [
  ...optionlessCommands.map(([name, description]) =>
    new SlashCommandBuilder().setName(name).setDescription(description),
  ),
  sayCommand,
].map((command) => command.toJSON());

const client = new DiscordClient({
  intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates],
});

const stt = new ElevenLabsSttService(config);
const tts = new ElevenLabsTtsService(config);
const llm = new OpenAiLlmService(config);

// Active voice sessions keyed by guild id.
const sessions = new Map<string, GuildVoiceSession>();
|
||||
|
||||
/**
 * Returns the voice channel the invoking member is currently in, or null when
 * there is none or the member's voice state is unavailable.
 *
 * `interaction.member` can be a raw API payload without a `voice` property,
 * so it is narrowed with an `in` check instead of being cast blindly.
 */
function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null {
  const member: GuildMember | null =
    interaction.member !== null && "voice" in interaction.member ? interaction.member : null;
  return member?.voice.channel ?? null;
}
|
||||
|
||||
/**
 * Registers the slash commands with Discord: for a single guild when
 * `DISCORD_COMMAND_GUILD_ID` is configured, globally otherwise.
 *
 * @param appClient currently unused; kept for the existing call signature
 */
async function registerCommands(appClient: Client): Promise<void> {
  const rest = new REST({ version: "10" }).setToken(config.DISCORD_BOT_TOKEN);
  const applicationId = config.DISCORD_APPLICATION_ID;
  const guildId = config.DISCORD_COMMAND_GUILD_ID;

  const route = guildId
    ? Routes.applicationGuildCommands(applicationId, guildId)
    : Routes.applicationCommands(applicationId);

  await rest.put(route, { body: commands });

  if (guildId) {
    logger.info("Registered guild commands", guildId);
  } else {
    logger.info("Registered global commands");
  }
}
|
||||
|
||||
/**
 * Returns the voice session for the invoking member's current channel.
 *
 * An existing session in the same channel is reused (only its text channel is
 * updated); a session in a different channel is destroyed and replaced.
 *
 * @throws Error when used outside a guild or when the member is not in a
 *         voice channel; the message is shown to the user by the caller.
 */
async function createSession(interaction: ChatInputCommandInteraction): Promise<GuildVoiceSession> {
  const { guild } = interaction;
  if (!guild) {
    throw new Error("Guild interaction required");
  }

  const voiceChannel = getVoiceChannel(interaction);
  if (!voiceChannel) {
    throw new Error("먼저 음성 채널에 들어가 주세요.");
  }

  const existing = sessions.get(guild.id);
  if (existing) {
    if (existing.voiceChannelId === voiceChannel.id) {
      existing.setTextChannel(interaction.channelId);
      return existing;
    }

    // The member moved to another channel: replace the old session.
    await existing.destroy();
    sessions.delete(guild.id);
  }

  const created = await GuildVoiceSession.create({
    client,
    config,
    logger,
    guild,
    voiceChannel,
    textChannelId: interaction.channelId,
    stt,
    tts,
    llm,
  });
  sessions.set(guild.id, created);
  return created;
}
|
||||
|
||||
/**
 * `/join`: starts (or reuses) the voice session and reports the channel name.
 * Any failure is reported to the user with the error's own message.
 */
async function handleJoin(interaction: ChatInputCommandInteraction): Promise<void> {
  await interaction.deferReply({ ephemeral: true });

  try {
    const session = await createSession(interaction);
    // The channel name is taken from the second line of the status summary.
    const channelLine = session.statusSummary().split("\n")[1];
    const channelName = channelLine?.replace("음성 채널: ", "") ?? "알 수 없음";
    await interaction.editReply(`음성 비서를 시작했습니다. 채널: ${channelName}`);
  } catch (error) {
    const failureText = error instanceof Error ? error.message : "세션 생성에 실패했습니다.";
    await interaction.editReply(failureText);
  }
}
|
||||
|
||||
/**
 * `/leave`: destroys the guild's voice session, if any.
 *
 * The session is removed from the registry even when teardown throws, so a
 * broken session can never stay registered and block a later `/join`.
 */
async function handleLeave(interaction: ChatInputCommandInteraction): Promise<void> {
  const guildId = interaction.guild?.id;
  const session = guildId === undefined ? undefined : sessions.get(guildId);
  if (guildId === undefined || !session) {
    await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
    return;
  }

  try {
    await session.destroy();
  } finally {
    sessions.delete(guildId);
  }
  await interaction.reply({ content: "음성 세션을 종료했습니다.", ephemeral: true });
}
|
||||
|
||||
/** `/status`: replies with the active session's status summary, if there is one. */
async function handleStatus(interaction: ChatInputCommandInteraction): Promise<void> {
  const guildId = interaction.guild?.id;
  const session = guildId === undefined ? undefined : sessions.get(guildId);

  const content = session ? session.statusSummary() : "현재 활성화된 음성 세션이 없습니다.";
  await interaction.reply({ content, ephemeral: true });
}
|
||||
|
||||
/** `/reset`: clears the conversation history and the playback queue. */
async function handleReset(interaction: ChatInputCommandInteraction): Promise<void> {
  const guildId = interaction.guild?.id;
  const session = guildId === undefined ? undefined : sessions.get(guildId);

  if (session) {
    session.clearConversation();
    await interaction.reply({ content: "대화 문맥과 재생 큐를 초기화했습니다.", ephemeral: true });
    return;
  }

  await interaction.reply({ content: "현재 활성화된 음성 세션이 없습니다.", ephemeral: true });
}
|
||||
|
||||
/**
 * `/say <text>`: queues the given text for speech in the active session.
 *
 * The confirmation is sent as soon as the text is queued; playback continues
 * in the background so the reply does not wait for the audio to finish.
 * Whitespace-only text is rejected instead of being sent to TTS.
 */
async function handleSay(interaction: ChatInputCommandInteraction): Promise<void> {
  await interaction.deferReply({ ephemeral: true });

  const session = interaction.guild ? sessions.get(interaction.guild.id) : undefined;
  if (!session) {
    await interaction.editReply("먼저 `/join` 으로 음성 세션을 시작해 주세요.");
    return;
  }

  const text = interaction.options.getString("text", true).trim();
  if (text.length === 0) {
    await interaction.editReply("읽을 문장을 입력해 주세요.");
    return;
  }

  // Fire-and-forget: the promise settles only after the playback queue drains.
  void session.speakText(text).catch((error: unknown) => {
    logger.warn("Manual speech request failed", error);
  });
  await interaction.editReply("읽기 요청을 대기열에 추가했습니다.");
}
|
||||
|
||||
/**
 * Destroys every voice session and the Discord client, then exits the process.
 * A session that fails to shut down is logged and does not stop the others.
 *
 * @param exitCode process exit code (defaults to 0)
 */
async function shutdown(exitCode = 0): Promise<void> {
  logger.info("Shutting down");

  for (const activeSession of sessions.values()) {
    try {
      await activeSession.destroy();
    } catch (error) {
      logger.warn("Session shutdown failed", error);
    }
  }
  sessions.clear();

  await client.destroy();
  process.exit(exitCode);
}
|
||||
|
||||
type CommandHandler = (interaction: ChatInputCommandInteraction) => Promise<void>;

// Slash-command name -> handler.
const commandHandlers = new Map<string, CommandHandler>([
  ["join", handleJoin],
  ["leave", handleLeave],
  ["status", handleStatus],
  ["reset", handleReset],
  ["say", handleSay],
]);

// Register slash commands once the gateway session is up; a failure is logged only.
client.once("ready", async () => {
  logger.info("Discord client ready", client.user?.tag ?? "unknown");
  try {
    await registerCommands(client);
  } catch (error) {
    logger.error("Command registration failed", error);
  }
});

// Route chat-input commands to their handler and report failures to the user.
client.on("interactionCreate", async (interaction) => {
  if (!interaction.isChatInputCommand()) {
    return;
  }

  try {
    const handler = commandHandlers.get(interaction.commandName);
    if (handler) {
      await handler(interaction);
      return;
    }
    await interaction.reply({ content: "알 수 없는 명령입니다.", ephemeral: true });
  } catch (error) {
    logger.error("Interaction handler failed", error);

    // An interaction that was already acknowledged can only be edited.
    if (interaction.deferred || interaction.replied) {
      await interaction.editReply("명령 처리 중 오류가 발생했습니다.").catch(() => null);
      return;
    }
    await interaction.reply({ content: "명령 처리 중 오류가 발생했습니다.", ephemeral: true }).catch(() => null);
  }
});

// Shut down cleanly on Ctrl+C and on termination requests.
for (const signal of ["SIGINT", "SIGTERM"] as const) {
  process.on(signal, () => {
    void shutdown(0);
  });
}
|
||||
|
||||
/** Entry point: logs the client in to Discord. */
async function main(): Promise<void> {
  await client.login(config.DISCORD_BOT_TOKEN);
}

// A startup failure is fatal: log it and exit with a non-zero code.
const reportFatalStartupError = (error: unknown): void => {
  logger.error("Fatal startup error", error);
  process.exit(1);
};

void main().catch(reportFatalStartupError);
|
||||
63
src/logger.ts
Normal file
63
src/logger.ts
Normal file
@@ -0,0 +1,63 @@
|
||||
/** Supported log severities, from least to most severe. */
type LogLevel = "debug" | "info" | "warn" | "error";

/** Numeric rank of each level; a higher rank means more severe. */
const levelOrder: Record<LogLevel, number> = { debug: 10, info: 20, warn: 30, error: 40 };
|
||||
|
||||
/**
 * Renders log arguments as a single space-separated line.
 *
 * Errors become `name: message`, strings are used verbatim and everything
 * else is JSON-encoded. Formatting never throws: values JSON cannot encode
 * (circular structures, BigInt, `undefined`, functions, symbols) fall back to
 * a plain string form instead of breaking the log call.
 */
function formatParts(parts: unknown[]): string {
  const render = (part: unknown): string => {
    if (part instanceof Error) {
      return `${part.name}: ${part.message}`;
    }
    if (typeof part === "string") {
      return part;
    }

    try {
      // At runtime JSON.stringify returns undefined for undefined/functions/symbols.
      const encoded: unknown = JSON.stringify(part);
      if (typeof encoded === "string") {
        return encoded;
      }
    } catch {
      // Circular reference or BigInt: use the fallback below.
    }

    return typeof part === "object" && part !== null
      ? Object.prototype.toString.call(part)
      : String(part);
  };

  return parts.map(render).join(" ");
}
|
||||
|
||||
/**
 * Minimal console logger with a severity threshold.
 *
 * Each line is `[ISO timestamp] [LEVEL] message`. Errors go to
 * `console.error`, warnings to `console.warn` and everything else to
 * `console.log`.
 */
export class Logger {
  constructor(private readonly level: LogLevel) {}

  /** True when `target` is at least as severe as the configured threshold. */
  private shouldLog(target: LogLevel): boolean {
    return levelOrder[this.level] <= levelOrder[target];
  }

  /** Formats and emits one line, unless `target` is below the threshold. */
  private write(target: LogLevel, ...parts: unknown[]): void {
    if (!this.shouldLog(target)) {
      return;
    }

    const timestamp = new Date().toISOString();
    const line = `[${timestamp}] [${target.toUpperCase()}] ${formatParts(parts)}`;

    switch (target) {
      case "error":
        console.error(line);
        break;
      case "warn":
        console.warn(line);
        break;
      default:
        console.log(line);
        break;
    }
  }

  debug(...parts: unknown[]): void {
    this.write("debug", ...parts);
  }

  info(...parts: unknown[]): void {
    this.write("info", ...parts);
  }

  warn(...parts: unknown[]): void {
    this.write("warn", ...parts);
  }

  error(...parts: unknown[]): void {
    this.write("error", ...parts);
  }
}
|
||||
77
src/services/conversation.ts
Normal file
77
src/services/conversation.ts
Normal file
@@ -0,0 +1,77 @@
|
||||
/** One stored turn of the conversation. */
export interface ConversationTurn {
  role: "user" | "assistant";
  text: string;
  /** Set on user turns only. */
  speakerId?: string;
  /** Set on user turns only. */
  speakerName?: string;
  /** Creation time from `Date.now()` (milliseconds since the epoch). */
  createdAt: number;
}

/** A single thing a user said, attributed to its speaker. */
export interface UserUtterance {
  speakerId: string;
  speakerName: string;
  text: string;
}

/**
 * Bounded conversation history that keeps the newest `maxTurns` turns and
 * renders them into a text prompt.
 */
export class ConversationMemory {
  private readonly turns: ConversationTurn[] = [];

  constructor(private readonly maxTurns: number) {}

  /** Appends a user turn and drops the oldest turns beyond the limit. */
  addUserTurn(utterance: UserUtterance): void {
    this.turns.push({
      role: "user",
      text: utterance.text,
      speakerId: utterance.speakerId,
      speakerName: utterance.speakerName,
      createdAt: Date.now(),
    });
    this.trim();
  }

  /** Appends an assistant turn and drops the oldest turns beyond the limit. */
  addAssistantTurn(text: string): void {
    this.turns.push({
      role: "assistant",
      text,
      createdAt: Date.now(),
    });
    this.trim();
  }

  /** Removes every stored turn. */
  clear(): void {
    this.turns.splice(0, this.turns.length);
  }

  /** Returns a copy of the stored turns, oldest first. */
  recentTurns(): ConversationTurn[] {
    return [...this.turns];
  }

  /**
   * Builds the prompt text: the recent history followed by the current
   * utterance.
   *
   * When the newest stored turn already is `currentUtterance` (same speaker and
   * text, i.e. the caller recorded it before asking for the prompt), that turn
   * is left out of the history so the utterance is not presented twice.
   */
  buildPrompt(currentUtterance: UserUtterance): string {
    const history = this.turns.slice(-this.maxTurns);

    const newest = history[history.length - 1];
    if (
      newest !== undefined &&
      newest.role === "user" &&
      newest.speakerId === currentUtterance.speakerId &&
      newest.text === currentUtterance.text
    ) {
      history.pop();
    }

    const recent = history
      .map((turn) => {
        if (turn.role === "assistant") {
          return `[assistant]\n${turn.text}`;
        }
        return `[user speaker_id=${turn.speakerId ?? "unknown"} speaker_name=${turn.speakerName ?? "unknown"}]\n${turn.text}`;
      })
      .join("\n\n");

    const historyBlock = recent.length > 0 ? recent : "(이전 대화 없음)";

    return [
      "최근 대화:",
      historyBlock,
      "",
      "이번 발화:",
      `[user speaker_id=${currentUtterance.speakerId} speaker_name=${currentUtterance.speakerName}]`,
      currentUtterance.text,
    ].join("\n");
  }

  /** Drops the oldest turns so that at most `maxTurns` remain. */
  private trim(): void {
    const overflow = this.turns.length - this.maxTurns;
    if (overflow > 0) {
      this.turns.splice(0, overflow);
    }
  }
}
|
||||
124
src/services/elevenlabs-stt.ts
Normal file
124
src/services/elevenlabs-stt.ts
Normal file
@@ -0,0 +1,124 @@
|
||||
import WebSocket from "ws";
|
||||
|
||||
import type { AppConfig } from "../config.js";
|
||||
|
||||
/** Fields read from a server message on the realtime STT socket. */
interface ElevenLabsMessage {
  message_type?: string;
  text?: string;
  error?: string;
}

/** Message types that end a request with "no transcript" instead of an error. */
const NON_FATAL_ERROR_TYPES = new Set(["insufficient_audio_activity"]);

/** Utterance-at-a-time speech-to-text over the ElevenLabs realtime WebSocket. */
export class ElevenLabsSttService {
  constructor(private readonly config: AppConfig) {}

  /**
   * Transcribes one complete utterance.
   *
   * A socket is opened per call; once the server announces the session, the
   * whole buffer is sent as a single committed chunk and the first non-empty
   * committed transcript is returned.
   *
   * @param pcm16MonoAudio 16-bit mono PCM, sent as 16 kHz audio
   * @returns the transcript, or null when there is no audio, nothing was
   *          recognised, the socket closed early, or 15 seconds elapsed
   * @throws on socket errors, unparsable messages and any unrecognised
   *         message type other than the non-fatal ones
   */
  async transcribePcm16(pcm16MonoAudio: Buffer): Promise<string | null> {
    if (pcm16MonoAudio.byteLength === 0) {
      return null;
    }

    const url = new URL("wss://api.elevenlabs.io/v1/speech-to-text/realtime");
    const query: ReadonlyArray<readonly [string, string]> = [
      ["model_id", this.config.ELEVENLABS_STT_MODEL],
      ["language_code", this.config.BOT_DEFAULT_LANGUAGE],
      ["audio_format", "pcm_16000"],
      ["commit_strategy", "manual"],
      ["include_timestamps", "false"],
      ["include_language_detection", "false"],
      ["enable_logging", "false"],
    ];
    for (const [key, value] of query) {
      url.searchParams.set(key, value);
    }

    return await new Promise<string | null>((resolve, reject) => {
      const socket = new WebSocket(url, {
        headers: {
          "xi-api-key": this.config.ELEVENLABS_API_KEY,
        },
      });

      let settled = false;
      let lastTranscript = "";

      const timeout = setTimeout(() => {
        finish(lastTranscript || null);
      }, 15_000);

      // Settles the promise exactly once and releases the timer and socket.
      const finish = (result: string | null, error?: Error) => {
        if (settled) {
          return;
        }
        settled = true;
        clearTimeout(timeout);
        try {
          socket.close();
        } catch {
          // Ignore close race.
        }

        if (error) {
          reject(error);
        } else {
          resolve(result);
        }
      };

      socket.on("message", (raw) => {
        let message: ElevenLabsMessage;
        try {
          message = JSON.parse(raw.toString()) as ElevenLabsMessage;
        } catch (error) {
          finish(null, error as Error);
          return;
        }

        const type = message.message_type;

        if (type === "session_started") {
          socket.send(
            JSON.stringify({
              message_type: "input_audio_chunk",
              audio_base_64: pcm16MonoAudio.toString("base64"),
              commit: true,
              sample_rate: 16000,
            }),
          );
          return;
        }

        if (type === "partial_transcript") {
          return;
        }

        if (type === "committed_transcript" || type === "committed_transcript_with_timestamps") {
          const transcript = message.text?.trim() ?? "";
          if (transcript.length > 0) {
            lastTranscript = transcript;
            finish(transcript);
          }
          return;
        }

        // Messages without a type are ignored; any other type ends the request.
        if (!type) {
          return;
        }

        if (NON_FATAL_ERROR_TYPES.has(type)) {
          finish(null);
          return;
        }

        finish(null, new Error(message.error ?? `ElevenLabs STT error: ${type}`));
      });

      socket.on("error", (error) => {
        finish(null, error as Error);
      });

      socket.on("close", () => {
        finish(lastTranscript || null);
      });
    });
  }
}
|
||||
83
src/services/elevenlabs-tts.ts
Normal file
83
src/services/elevenlabs-tts.ts
Normal file
@@ -0,0 +1,83 @@
|
||||
import { Readable } from "node:stream";
|
||||
|
||||
import ffmpegStatic from "ffmpeg-static";
|
||||
import prism from "prism-media";
|
||||
import { StreamType, createAudioResource, type AudioResource } from "@discordjs/voice";
|
||||
|
||||
import type { AppConfig } from "../config.js";
|
||||
|
||||
/** A ready-to-play audio resource plus the hook that releases its streams. */
export interface PreparedSpeechPlayback {
  resource: AudioResource;
  /** Destroys the HTTP body stream and the ffmpeg transcoder. */
  dispose: () => void;
}

/** Streaming text-to-speech via ElevenLabs, transcoded to raw PCM with ffmpeg. */
export class ElevenLabsTtsService {
  constructor(private readonly config: AppConfig) {
    // Use the bundled ffmpeg binary unless FFMPEG_PATH is already set.
    const resolvedFfmpegPath = ffmpegStatic as unknown as string | null;
    if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) {
      process.env.FFMPEG_PATH = resolvedFfmpegPath;
    }
  }

  /**
   * Requests streamed speech for `text` and wraps it in an audio resource.
   *
   * The MP3 response body is piped through ffmpeg, which outputs 48 kHz stereo
   * s16le PCM for the audio player.
   *
   * @param text sentence to synthesize
   * @param signal aborts the HTTP request (and its body stream) when triggered
   * @throws Error when the HTTP request fails or the response has no body
   */
  async preparePlayback(text: string, signal?: AbortSignal): Promise<PreparedSpeechPlayback> {
    const voiceId = encodeURIComponent(this.config.ELEVENLABS_VOICE_ID);
    const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/stream`);
    url.searchParams.set("output_format", "mp3_44100_128");
    url.searchParams.set("enable_logging", "false");

    const response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "xi-api-key": this.config.ELEVENLABS_API_KEY,
      },
      body: JSON.stringify({
        text,
        model_id: this.config.ELEVENLABS_TTS_MODEL,
        language_code: this.config.BOT_DEFAULT_LANGUAGE,
        voice_settings: {
          stability: 0.35,
          similarity_boost: 0.75,
          speed: 1.05,
        },
      }),
      signal,
    });

    if (!response.ok || !response.body) {
      // Release the unread error body instead of leaving it open.
      await response.body?.cancel().catch(() => undefined);
      throw new Error(`ElevenLabs TTS request failed with status ${response.status}`);
    }

    const input = Readable.fromWeb(response.body as never);
    const ffmpeg = new prism.FFmpeg({
      args: [
        "-analyzeduration",
        "0",
        "-loglevel",
        "0",
        "-i",
        "pipe:0",
        "-f",
        "s16le",
        "-ar",
        "48000",
        "-ac",
        "2",
        "pipe:1",
      ],
    });

    // pipe() neither handles source errors nor ends the destination on them.
    // Without this listener a mid-stream network failure or abort would be an
    // uncaught 'error' event and would leave ffmpeg waiting for input.
    input.on("error", () => {
      ffmpeg.destroy();
    });
    input.pipe(ffmpeg);

    const resource = createAudioResource(ffmpeg, {
      inputType: StreamType.Raw,
    });

    return {
      resource,
      dispose: () => {
        input.destroy();
        ffmpeg.destroy();
      },
    };
  }
}
|
||||
64
src/services/openai-llm.ts
Normal file
64
src/services/openai-llm.ts
Normal file
@@ -0,0 +1,64 @@
|
||||
import OpenAI from "openai";
|
||||
|
||||
import type { AppConfig } from "../config.js";
|
||||
import type { ConversationMemory, UserUtterance } from "./conversation.js";
|
||||
|
||||
/** System instructions sent with every LLM request, joined into one line. */
const ASSISTANT_INSTRUCTION_LINES: readonly string[] = [
  "너는 디스코드 음성 채널에서 동작하는 한국어 음성 비서다.",
  "답변은 짧고 실용적으로 한다.",
  "기본은 한 문장, 길어도 두 문장을 넘기지 않는다.",
  "말투는 자연스러운 한국어로 유지한다.",
  "speaker_id와 speaker_name은 화자 구분용이므로 필요할 때만 자연스럽게 반영한다.",
  "잘 못 들었거나 의미가 불명확하면 짧게 다시 물어본다.",
  "목록, 마크다운, 코드블록은 쓰지 않는다.",
];

const ASSISTANT_INSTRUCTIONS = ASSISTANT_INSTRUCTION_LINES.join(" ");
|
||||
|
||||
/**
 * Collapses whitespace and limits a reply to a length suitable for speech.
 *
 * Replies of up to 180 characters are returned as-is (after whitespace is
 * collapsed). Longer replies are reduced to their first two sentences and then
 * hard-cut at 180 characters. Sentences are trimmed before being re-joined so
 * the result never contains a double space.
 */
function normalizeReply(text: string): string {
  const maxLength = 180;

  const compact = text.replace(/\s+/g, " ").trim();
  if (compact.length <= maxLength) {
    return compact;
  }

  const sentences = compact.match(/[^.!?]+[.!?]?/g);
  if (!sentences || sentences.length === 0) {
    return compact.slice(0, maxLength).trim();
  }

  // Every match after the first begins with the space that separated the sentences.
  return sentences
    .slice(0, 2)
    .map((sentence) => sentence.trim())
    .join(" ")
    .slice(0, maxLength)
    .trim();
}
|
||||
|
||||
/** Generates short spoken replies through the OpenAI Responses API. */
export class OpenAiLlmService {
  private readonly client: OpenAI;

  constructor(private readonly config: AppConfig) {
    this.client = new OpenAI({ apiKey: this.config.OPENAI_API_KEY });
  }

  /**
   * Asks the model for a reply to `utterance`, using the conversation history
   * as context.
   *
   * @returns the normalized reply, or a fixed "please repeat" sentence when
   *          the model produced no text
   */
  async generateReply(memory: ConversationMemory, utterance: UserUtterance): Promise<string> {
    const prompt = memory.buildPrompt(utterance);

    const response = await this.client.responses.create({
      model: this.config.OPENAI_MODEL,
      instructions: ASSISTANT_INSTRUCTIONS,
      input: [
        {
          role: "user",
          content: [{ type: "input_text", text: prompt }],
        },
      ],
      max_output_tokens: 120,
    });

    const output = response.output_text?.trim();
    return output ? normalizeReply(output) : "잘 못 들었습니다. 한 번만 다시 말씀해 주세요.";
  }
}
|
||||
21
tsconfig.json
Normal file
21
tsconfig.json
Normal file
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"target": "ES2022",
|
||||
"module": "NodeNext",
|
||||
"moduleResolution": "NodeNext",
|
||||
"strict": true,
|
||||
"noEmit": false,
|
||||
"rootDir": "src",
|
||||
"outDir": "dist",
|
||||
"esModuleInterop": true,
|
||||
"forceConsistentCasingInFileNames": true,
|
||||
"skipLibCheck": true,
|
||||
"resolveJsonModule": true,
|
||||
"types": [
|
||||
"node"
|
||||
]
|
||||
},
|
||||
"include": [
|
||||
"src/**/*.ts"
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user