From 73546c15b9f29712428375cf1640a51af65584c0 Mon Sep 17 00:00:00 2001 From: claude-bot Date: Thu, 30 Apr 2026 03:21:30 +0900 Subject: [PATCH] Replace ElevenLabs with local STT and TTS --- .env.example | 16 +- .gitignore | 1 + README.md | 188 ++++++++--------- bun.lock | 2 +- package.json | 2 +- .../local_stt_worker.cpython-314.pyc | Bin 0 -> 9111 bytes .../local_tts_worker.cpython-314.pyc | Bin 0 -> 7848 bytes python/local_stt_worker.py | 145 ++++++++++++++ python/local_tts_worker.py | 125 ++++++++++++ python/requirements.txt | 2 + src/audio/guild-voice-session.ts | 8 +- src/audio/local-voice-session.ts | 8 +- src/config.ts | 26 +-- src/discord-main.ts | 12 +- src/local-main.ts | 13 +- src/python-runtime.ts | 90 +++++++++ src/services/elevenlabs-stt.ts | 124 ------------ src/services/elevenlabs-tts.ts | 78 -------- src/services/local-stt.ts | 43 ++++ src/services/local-tts.ts | 94 +++++++++ src/services/python-json-worker.ts | 189 ++++++++++++++++++ src/services/stt.ts | 4 + src/services/tts.ts | 11 + src/setup-local-ai.ts | 88 ++++++++ 24 files changed, 943 insertions(+), 326 deletions(-) create mode 100644 python/__pycache__/local_stt_worker.cpython-314.pyc create mode 100644 python/__pycache__/local_tts_worker.cpython-314.pyc create mode 100644 python/local_stt_worker.py create mode 100644 python/local_tts_worker.py create mode 100644 python/requirements.txt create mode 100644 src/python-runtime.ts delete mode 100644 src/services/elevenlabs-stt.ts delete mode 100644 src/services/elevenlabs-tts.ts create mode 100644 src/services/local-stt.ts create mode 100644 src/services/local-tts.ts create mode 100644 src/services/python-json-worker.ts create mode 100644 src/services/stt.ts create mode 100644 src/services/tts.ts create mode 100644 src/setup-local-ai.ts diff --git a/.env.example b/.env.example index 92ee550..7579972 100644 --- a/.env.example +++ b/.env.example @@ -2,15 +2,23 @@ DISCORD_BOT_TOKEN= DISCORD_APPLICATION_ID= DISCORD_COMMAND_GUILD_ID= -ELEVENLABS_API_KEY= -ELEVENLABS_VOICE_ID= -ELEVENLABS_STT_MODEL=scribe_v2_realtime -ELEVENLABS_TTS_MODEL=eleven_flash_v2_5 OLLAMA_BASE_URL=http://localhost:11434 OLLAMA_MODEL=qwen3:0.6b OLLAMA_KEEP_ALIVE=5m OLLAMA_NUM_CTX=4096 +LOCAL_AI_VENV_PATH=.local-ai/.venv +LOCAL_AI_CACHE_DIR=.local-ai/cache +LOCAL_AI_PYTHON= +LOCAL_STT_MODEL=tiny +LOCAL_STT_DEVICE=auto +LOCAL_STT_COMPUTE_TYPE=auto +LOCAL_STT_BEAM_SIZE=1 +LOCAL_TTS_LANGUAGE=KR +LOCAL_TTS_SPEAKER=KR +LOCAL_TTS_DEVICE=auto +LOCAL_TTS_SPEED=1.12 + BOT_DEFAULT_LANGUAGE=ko MAX_CONVERSATION_TURNS=12 LOCAL_AUDIO_SOURCE= diff --git a/.gitignore b/.gitignore index 9c97bbd..8d7204a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ node_modules dist .env +.local-ai diff --git a/README.md b/README.md index 4a6dbad..4879f0f 100644 --- a/README.md +++ b/README.md @@ -1,127 +1,138 @@ # realtime_voice_bot -디스코드 음성 채널 또는 로컬 PC 마이크/스피커에서 한국어 음성을 인식하고, 로컬 LLM 응답을 생성한 뒤 ElevenLabs TTS로 다시 읽어주는 최소 프로토타입입니다. +디스코드 음성 채널 또는 로컬 PC 마이크에서 한국어 음성을 인식하고, 완전 로컬 스택으로 답변을 생성한 뒤 다시 음성으로 읽어주는 최소 프로토타입입니다. + +## 현재 스택 + +- STT: `faster-whisper` + Whisper multilingual +- LLM: `Ollama` + `qwen3:0.6b` +- TTS: `MeloTTS` Korean +- VAD: `avr-vad` + +외부 유료 API나 무료 한도형 API는 쓰지 않습니다. ## 현재 구현 범위 - Discord slash command 기반 제어: `/join`, `/leave`, `/status`, `/reset`, `/say` -- 로컬 테스트 모드: `pw-record` 입력, `pw-play` 출력 +- 로컬 테스트 모드: PC 마이크로 직접 말하고 바로 응답 확인 - `@discordjs/voice` 기반 음성 채널 입장 및 유저별 오디오 수신 - 48k stereo PCM을 16k mono로 내려서 유저별 VAD 처리 -- Silero 계열 VAD(`avr-vad`)로 발화 시작/종료 감지 -- ElevenLabs Scribe Realtime WebSocket으로 발화 단위 STT -- Ollama 로컬 LLM으로 짧은 한국어 답변 생성 -- ElevenLabs Flash v2.5 스트리밍 TTS -- 채널 단위 단일 재생 큐 -- 사용자 발화 시작 시 현재 TTS와 대기열 중단(barge-in) +- 화자 발화 시작 시 현재 재생과 대기열 즉시 중단 +- Python 로컬 워커를 한 번 띄워 STT/TTS 모델을 메모리에 유지 -## 권장 환경 +## 필수 준비물 - Bun `1.3+` - Node.js `22.12+` +- Python `3.11+` +- `ffmpeg` - Ollama -- Discord bot with Voice permissions -- ElevenLabs API key + 사용할 Voice ID + +Discord 모드까지 쓸 거면 추가로: + +- Discord bot token +- Discord application id + +## 빠른 시작 + +```bash +bun install +ollama pull qwen3:0.6b +bun run setup:local-ai +``` + +그다음 로컬 장치 확인: + +```bash +bun run devices +``` + +실행: + +```bash +bun run start:local +``` + +Discord 모드: + +```bash +bun run start:discord +``` ## 환경 변수 -`.env.example`를 참고해서 `.env`를 채우면 됩니다. - -필수: - -- `ELEVENLABS_API_KEY` -- `ELEVENLABS_VOICE_ID` +`.env.example`를 복사해서 `.env`를 채우면 됩니다. Discord 모드에서만 필수: - `DISCORD_BOT_TOKEN` - `DISCORD_APPLICATION_ID` +기본값이 이미 들어있는 로컬 AI 설정: + +- `OLLAMA_BASE_URL` +- `OLLAMA_MODEL` +- `OLLAMA_KEEP_ALIVE` +- `OLLAMA_NUM_CTX` +- `LOCAL_AI_VENV_PATH` +- `LOCAL_AI_CACHE_DIR` +- `LOCAL_STT_MODEL` +- `LOCAL_STT_DEVICE` +- `LOCAL_STT_COMPUTE_TYPE` +- `LOCAL_STT_BEAM_SIZE` +- `LOCAL_TTS_LANGUAGE` +- `LOCAL_TTS_SPEAKER` +- `LOCAL_TTS_DEVICE` +- `LOCAL_TTS_SPEED` + 선택: - `DISCORD_COMMAND_GUILD_ID` - 테스트 서버에만 slash command를 즉시 반영하려면 설정 -- `OLLAMA_BASE_URL` - - 기본값: `http://localhost:11434` -- `OLLAMA_MODEL` - - 기본값: `qwen3:0.6b` - - 가장 빠른 무료 오픈웨이트 로컬 기본값 -- `OLLAMA_KEEP_ALIVE` - - 기본값: `5m` -- `OLLAMA_NUM_CTX` - - 기본값: `4096` +- `LOCAL_AI_PYTHON` + - Python 경로 자동 탐지가 안 되면 설정 + - 예시: `python` + - Windows 예시: `py -3` - `LOCAL_AUDIO_SOURCE` - - `pw-record --target` 에 넣을 PipeWire source id 또는 node name + - 로컬 입력 장치 + - Linux는 `pw-record --target`, Windows는 `ffmpeg dshow` 장치 이름 - `LOCAL_AUDIO_SINK` - - `pw-play --target` 에 넣을 PipeWire sink id 또는 node name + - Linux 로컬 출력 장치 + - Windows는 현재 시스템 기본 출력 장치 사용 - `LOCAL_SPEAKER_NAME` - 로컬 테스트에서 프롬프트에 넣을 화자 이름 -- `ELEVENLABS_STT_MODEL` - - 기본값: `scribe_v2_realtime` -- `ELEVENLABS_TTS_MODEL` - - 기본값: `eleven_flash_v2_5` +- `BOT_DEFAULT_LANGUAGE` + - 기본값 `ko` - `DEBUG_TEXT_EVENTS` - - `true`면 명령을 실행한 텍스트 채널에 transcript/reply를 같이 올림 + - `true`면 transcript/reply를 콘솔에 같이 출력 -## 실행 +## 속도 우선 기본값 -```bash -bun install +- STT 기본 모델은 `tiny` +- LLM 기본 모델은 `qwen3:0.6b` +- TTS 기본 속도는 `1.12` + +정확도가 아쉬우면: + +```env +LOCAL_STT_MODEL=small +OLLAMA_MODEL=qwen3:1.7b ``` -Ollama 준비: +## 로컬 테스트 순서 -```bash -ollama pull qwen3:0.6b -``` - -속도보다 품질이 더 중요하면: - -```bash -ollama pull qwen3:1.7b -# 또는 -ollama pull qwen3:4b -``` - -디스코드 모드: - -```bash -bun run start:discord -``` - -로컬 장치 목록: - -```bash -bun run audio:devices -``` - -로컬 테스트 모드: - -```bash -bun run start:local -``` - -타입 체크: - -```bash -bun run check -``` - -## 사용 흐름 - -1. 봇을 서버에 초대하고 음성 권한을 부여합니다. -2. 음성 채널에 들어갑니다. -3. 텍스트 채널에서 `/join` 실행 -4. 말을 하면 봇이 발화 단위로 인식하고 음성으로 짧게 답합니다. -5. 다시 말하면 현재 읽고 있던 TTS는 즉시 중단됩니다. - -로컬 테스트: - -1. `bun run audio:devices` 로 source/sink id 또는 이름 확인 +1. `bun install` 2. `ollama pull qwen3:0.6b` -3. 필요하면 `.env` 에 `LOCAL_AUDIO_SOURCE`, `LOCAL_AUDIO_SINK`, `OLLAMA_MODEL` 설정 -3. `bun run start:local` -4. 마이크로 바로 말해서 응답 확인 +3. `bun run setup:local-ai` +4. `bun run devices` +5. 필요하면 `.env` 에 `LOCAL_AUDIO_SOURCE` 설정 +6. `bun run start:local` + +## Windows 메모 + +- `bun run devices` 와 Windows 로컬 녹음은 `ffmpeg`가 필요합니다. +- 출력 장치 직접 선택은 아직 미구현이라 시스템 기본 출력 장치로 재생됩니다. +- Python 탐지가 안 되면 `.env` 에 `LOCAL_AI_PYTHON=py -3` 또는 `LOCAL_AI_PYTHON=python` 을 넣으면 됩니다. ## 설계 메모 @@ -129,5 +140,4 @@ bun run check - 출력은 길드 세션당 단일 큐 - 로컬 모드는 단일 화자 입력 기준 - 화자 구분은 `speaker_id`, `speaker_name`을 LLM 프롬프트에 항상 포함 -- 현재 기본 LLM은 `qwen3:0.6b` 이며 속도 우선 설정이라 답변 품질이 약하면 `qwen3:1.7b` 또는 `qwen3:4b` 로 올리는 것을 권장합니다. -- STT/TTS는 아직 ElevenLabs API를 사용하므로 프로젝트 전체가 완전 무과금은 아닙니다. +- 모델 다운로드 캐시는 기본적으로 `.local-ai/cache` 아래에 저장 diff --git a/bun.lock b/bun.lock index aa5c27c..700a171 100644 --- a/bun.lock +++ b/bun.lock @@ -12,7 +12,6 @@ "ffmpeg-static": "^5.3.0", "opusscript": "^0.1.1", "prism-media": "^1.3.5", - "ws": "^8.20.0", "zod": "^4.3.6", }, "devDependencies": { @@ -22,6 +21,7 @@ }, }, "trustedDependencies": [ + "ffmpeg-static", "onnxruntime-node", ], "packages": { diff --git a/package.json b/package.json index c59611c..64cc66f 100644 --- a/package.json +++ b/package.json @@ -8,6 +8,7 @@ "start": "bun src/index.ts discord", "start:discord": "bun src/index.ts discord", "start:local": "bun src/index.ts local", + "setup:local-ai": "bun src/setup-local-ai.ts", "devices": "bun src/index.ts local-devices", "audio:devices": "bun src/index.ts local-devices", "check": "tsc --noEmit", @@ -25,7 +26,6 @@ "ffmpeg-static": "^5.3.0", "opusscript": "^0.1.1", "prism-media": "^1.3.5", - "ws": "^8.20.0", "zod": "^4.3.6" }, "devDependencies": { diff --git a/python/__pycache__/local_stt_worker.cpython-314.pyc b/python/__pycache__/local_stt_worker.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4aeda5239e814fcd02cc734dc39fc4e3370a182a GIT binary patch literal 9111 zcmb_ieN06*MSnyz-FRsUECCh5>^ zrPZEuAI~PllXRrL#P{9rd+xpGp7T5BW_M``PaypFi9f|Yb`kQQm@xvkmbm{XNKBF; zA~4-V)xV6&(7Z`C(J!mA^vkIn{hC!X{aRGZSyQmo7_#sUx(jI|C71-Z)uiWYiNI|l zf_W2(yHva27A%mnVlK|8ykLWzoz^)79?}k)c2<%)A~+#mVvM7f2&IrOqxGd{%LLc{ zGlWEeB0o9nsv~Dh>qzSw0j-RlIL8RgHquc36=wT1SMcS~@&3N<{?5Lm{T=JsX|dI*_67ho=Ya0;>k!jE-I=jjwj^{lH8cOtT{z7oJb_q zuquh7?1U*w;HOkUG)Yzo!!)m8^nS%Mex^M%cOPmAT{|W)Sl~z3{sJT>$)K?(unU80 z$4N+#!=?`h0a)re>!6^GH0coa<{~HFHXiB~;jw%Zg@E)TkuG z*2p+5-IpD*1CkmWPmanSNYO=IFarUX5AF$W?Yy50|j%$*!?xBOTK>;Pck9K&{ z!4(!Hu!H#}3EUnlbYBaCS>T#DSd2wBLy^YI((7qSQN>tPGbJx*W}rVQ_sQ4=R>$Jn zxZz|nF59t$k?~IICr(#JPliT#X_UPX;qf?_O)D6Epe*C(R~-G^+V(kjdlyY+&J5B9 z`l9p1LU){*F8Y7zSOT1szzi@=rW7~8?1VB5QP^NVIbAQ9c0lfAS%^KDuN#HlempaF zfD<^ZFK~A-f95!f9)h?L8^RzSAPqr0gbksNz{qC7^4!{Jf_2Zj7RFVrebT(h85@Fe z-GO==9Fko##iH^iXoS~KNypWRl62xiA~~K2(1VgULtPi#MW>Wd(y}Cml}Ie6F?=5! zFtQot$z)p9%;R!Ql{D_Wl1ymkXnHKA$RJfogIV^%hfX5bU zAXnXzscxC+pR&GVowsfNBWHcH=f^!c&NI(>-Va@U<-J#CtaH*r<)Itu7nMgBxQ?au zE&iSJK6*>=5GcdcQxYD3OPZK%b2f-%+*+5f)LPhhd10Ma8s2J?E;LZZMUc;RYjU5p%LYe#G4FhP*OgswH(qJX_;$|@Wck({e_)~F@P6Jb^R@3KH}^W)53x57G7vwc0rLFROo@CvZwsaZ zzfl#=>N4JMquVpPv`3iyP7ocU~^Q#J)D_lWnOGBQ_UqcqO>{>Bs*6%5A z|6kZ1Sa}261F~7i_Hgw? zx`SIM3y4GW<|l4k4-}t=zKRE;ZxDr)n)wxsrt9hFQNG2gfD>%@JwUNxMjU5S4CrnS z>Sze&v4Bwmd|gERt~U{akJ0T#kqSxVXBjLr<}rlu07>Lg$RO$>8U{!Z75Ii=pbzMS ztD|8M5rm%l>sJLx$52S?#zsI+!vF(lhy?R_qnu!A5&EDM&u2pbEQM+l0??gTM}vq* zXb`N$=Wqf}Dn1X?*o)5vH9XWH%-Il}WS`AgBOovbaHms7UPgd*`dmzbn|d-CmEu}i zL0p{d@96B+I5n2ItW^|>k94-56i;=((y1{|uL%Rqx&F>ic&XX57tEdj*hkUJfiWEV z@PsX(%j?4v_P|<)!xNUkuoNCU+}BX1m5ziJRg%SVogf)SMlm_0@pM2j5gwDMw$mjN ziW<2GBOgSX9mf@w*aR3gv^_Ob(9lt&_y`^RA^Bw!#aJQ+cTy`~e+Bh2di{O}!%mXl z+nWAZ<-fk`+OF%3*BUbo2X1s_tB&TXy63C9vsEW@RWD_#Ui$pv6!%uylD9tV-2o}b zlD~P*^AFV@S7-f)r?>^1Z^`HXK)C+OwO6hWT^q{mY5$}pTXQVyI}VLY?^f0=I$dw~ zU+G`0*mU*ay9XD2H96m|dEc%FR?f>q6aE2D?9Ro~`kBx##Sg_y{WEiCZbbj(!arWP zarX0uOz2E5B+ZAU%&C#gNIWz4dT#88^J72E#NWs`fAnY8WakSK!>}9imQsLs01|&m zvYldH1wBJ!_95eo!?GsOI-ofKG=Wk9V>y--hr^2WY^!FAhZCb|Ft&C3OUI_VK$MR_ z^GC#>d=yH%v1SKEROW3l`&I}1Z5TEZTAmA z{3PuSx1BNWNQxCmimQHttOU0QgR;@EmJTu^lQyMUG4I?+nFq|p?5q^nZ}```MlDF? zyi6-5adf-PkXA+Ju?$#(dFe{mPNSjxifRe8DB}JZG8t>8E%GwBFos}xs6}O=#z+mY zg%%sv1@Z{3uni%IH*`Z7;08?48vP&}!VvDVNw5kw!T#Kuz8DATC0`QwU|y0II-z4~ z!0O7MfY562XnspexFv$_E zDw0e@V`$2V@Gm8UnUhQ_qAFcdHU45aDuM$F#J^TP9s}oDS{2c;rl?^s=rxBbje(8| zgQHH;m;l9O#%y?pTHtqo8ltVR+mj?jy4nbN-5rFIjUl*p=hl}STww5k?rAvz`vXSg z`i5)i$Bl7`Oqy%S2@q>`vV{o18PBj_G%|gtdT_B6OGC!g*sNI?3Vs1Io(`=L=O zAzezzFK9aps)*W(5ji$2H6Ef0R(YW@-!Ke>Da{al1*^|ghi}^EPXAl)zxL*Ohvs{S za=ouD_?xGXUGKftoAWo$`x|rq=8V7jQ``S2D0hBR*UmGuTOn$w&@5{*+dlkkYgCGW z>eg(y`_w&H-Um828iH(64$GIj&}GP>9@aQu4~UZ~DWN$PNliq~ftyo`D>SFTDUQdY z>N%Plkx_Gl*{eCI*M;U48J&R|Us$KkKiv6bETP*UXp?A`bRr&0T+pojr#jK4*I<|X zp-H^qyjQ#5>z~WsN^H87nByM9$RtQ9IMd(KdWzZlW@RFejzJCYeNmR6OhxA~G;S?Fnn_b1; zv0S8+>EZ}UnL?&wpe8>K)Nc^!V7J4J=%AvZn8{X0LU~Y>Vhbi_h=mu2b@0Sc|0xb? z%xO+P3t89iODA)}sw>(d3c(65MF5-N;`vF68AOAG27wsV0^lx`t^>*IN=?PQ)Ps3< zMKBN8jnN6bgNE{B88aDQ>P2Q#%!|wrTrGj7VvGT?0E_`-)B{j{MyFhO5)E68K80@` z8QgB<3e-SqZG3~C$DK*aR@n73oZt%C1h?VFD69ka{x*4JB@N{h+egBnUZVju89?hf z7LJ1v9-x>d050=@{uVm%f}BnSUK@r8695^~sRO#_ARK!wKxJqkHUd5afD}v+~#h74|k}$y;^x{CnqT zJm~N^2HuV7(=(yJKb!UKp6Xew^iQApqH@Pn7o@Jf`tGYUn`ifBD;uY}zAUSp_AZpw zF1fZ$Pt3N??Yz-CSG(Y9zw6q(2vz=tvh7RWKxW%>H!O3PGIecPZ+pho{$+(PS5ZG- zQNL6ZnDPFy>cgtpz8gofbuVUX+HmFp^sCJB*_xJI&GVU>=jY_C??BGiHt%c8`i|s$ zJsDrmCsBYdp6&Mu>nWY;cwi$X)zb%N)Y*}_=x?p?e_hqi{mhI5(y9fIL z(>x$1*r-Kc_4Q5HHvPhvt82Sc*LKVEN#s-C=SQ=3!L08GIp67d-|4LH<*A+}XB7~X zp6kguUtDm$_}#J@dw;8Rz$yKN@7PKHx!u>rnSSeS_rmAXz10xkvTwq8r=yEuZ|!U6 zIuEh851AqJ8Rs~*!~9uUyW?1$`FC|@$RL)%V+XKd>g#K8%Ol7w$f&x@4OOIk4x=fI zE?|Vu0U8fCH7bpS;muaR56H&48;*M5C{RIOlHZ`sPhAG5-dN|Qm@2npDLO%QwzD15 zA=C>pT5iB}grYuxiwtv*l-whZd&GW^@b^gBJ>r7wvV}10oBWUY9I2cql^O3-vpW~a zzOOCtA0<|8@ErfOAKoy(4#11fYIzd_{vwFJjxqp8mU|h(Ij1HTSl?aV^KR(p@DAqB NaQ+pRa8;D1{{w-{#<&0g literal 0 HcmV?d00001 diff --git a/python/__pycache__/local_tts_worker.cpython-314.pyc b/python/__pycache__/local_tts_worker.cpython-314.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e12fcdbcef2dd6fbca5f22176a86138a261847b4 GIT binary patch literal 7848 zcmbVReQZ-#mcQ@A-%e~3+liAvoCE@flJJoh8om9ooW&dNf(r#5~m;ctr={S94G_1B-Y5rI#EX)M8(rVAS z&a+cDg>4Vv-uv!7_uO;8&iS3|ZPis)0`%X<{zvMrCgeZyLnnTv@ZEV{0azD+6^h#lj>b zFs&p|{YU&9)||mNLq|^x51!~B?mf{rID8~+NU0Os_l_g`N*98Q!NLD0TOgSsBjhZ* z_gxqW>Ke)G42m3umLlPVI#?Kj|07OIGfjz#5;-SoMp;x-a(pE1s zaGVzHwT{Wel-MyIi=?7rNB>Z7EOM#ioSaA{l@3{q#8hcY3|~x0<6?L$p+Z$M(UF{1 zClm3GSYkX93#+OUzLbzJh;n;!TC<14k$60zMpQ8zmTfQu?ub$k$qZR0471}gCLh-< zVRpk!zn(xVp>xLqk`eyXxgUhW3>nqe1Xf{mW!ne|^2qvq65z<}!Ja^{vA7~!D8Op` zgcK7s!$d5lOb!P)&5)F(xC%=$t!PF?jfyfXjf}(6b!n6FkZN2kaZav*5}m|4D9n&W z-jdn;33tnS(>k}Yz<0>l_PvcWLj_fSn09#4#+4@|aHFLu3H(bY=)OXNLEt-hn2b?2 zK$T{d#dlJoqK2iY#wIRk2GBnt56jpEX2;>&__0JHCZjSiGYe$J9j7y+J3|vEHf)Ck zx5viRKE~t|WeKyt=V^AY^*;Z_R$9!yF-jSVlP)O>-Em|(>Hnp&1lTEo8DTouBtODz zhB~AuEN~+^wNYT(;OjL;)#geo88)rmIr%3-QFI zxSwv6TnlXja2M@jTuI4dIHHV8lEzqvVS`p%EuBKwbpiD9mS44RuG}E1rLScqHGTSrf z7a$p1GH?##%#md?F8gN0O-s5GvI!wL#)nJM2S&089{eYF8>0zSnJ z$uqoN9)tBUCO0u!MNMD`>Tg z3qI$}u6YAVyfn(s7@03S*Pk(R$?W}%k>PghZu%#V496&5vKgaj^C|rd9ZZ5**Fm=_ zAe(|76gk3x9U^N16{y8KSwVtDw^tygw2#1x_N9V?brlux8ARV(K|!<_6_jL0*tO0P zj0Di8l2*;E1h0S-FIWXl>_of1wv=FV1v7l=wlvB8h(6p;7<&tjJ=&^|GP+4Gfo$C= z=~b)7)<@8DSozW{M+15X!3MKJ?_WW7BG}iSFSJxa%Nn~3E!EJ1wxWXUWS3dj1N@5> zY?MPr1B8e!6gsJ`JAR_K=Xe;3;p08SNB*McNWaD&6Es)3{$#Mf=UBg>Im=aj{Ud|D z{Td%hsR_+dt_RlszBIqHedjKKP3Bl6elA6EWK_HeAaFSCqp)u~z&=0V!|1eMXE~6z z_?4s>0VH@JZT1)W52Q_4DMk+jJeqk*j3wG5Nl8Y>3&2IuOk(_^B!h3`&WYgd0HI1r zc{jG&R%C@Qh|@A23Pf*-2r{8O%`laSiZP9wOez{5ji?d8amfq201#RAo=`TVl1Wk4 z__%l(5D~aiSuRqUJ+vxMTJU3N>N3##t|mRq_h}S(D=6b{VUGY91cry z3H*SzZnfDj)=`s>NNa}t-t7OTt{#Ent?HZA+0Fazgz|NV3w1|xbw~4cLxnmaTPNI) zX7~@P78};z>blu=>)_3U*{uiesQHGzLc>t5VJP2lywDKJ!e20y;pfeDi{85H=dYc= z9={gPZrXFloA-7XyaPG!K;C<_;1#l7;r;+fv@Lq-KiO55x$n*h$jp2Cq0Raqp1RqM zf4BWN+wbuIYWpYKyr&Oq7rgbl2*7OFST@&8cu+p5{WzAb&m7rKMF?qK%N>)F@e z%$`15I4$N*i`h3PvJ+F;xLk-|&c!chr>3*^^q)D_Vl8`AJzl_#qn;n}D4|V?k0<|+lAW6321yB?sQwH6Y9C0PML zw!nAd;s7Fy!LrG7XER=4R<9~ypGKaANZtz}xl5wm5}+Dcl9pKK$#jB~ILjYMLb$a+{G{CE7L zecXB?F@7>ya=u0qg+qpc%X;W95uM|Ca$e(#Tn!{t))-9U|3~A{Dvbq*2rG1fbuQ|H zZMrr^zExwvDgCKqNtGj&W-XP*(Z>szHInArFGVg6fDbV$sfh{cvStKxEvn)$gyZsl z*b~k0PD%o!&&p!~yJl9!sU!{3*9}Le#ApaU6LLhJ9>Bnzk3*e&0DIZM^Mutz7T0#RNmPWFu9r zPl>Ur`l*ww^Inx^-33?MysK@lZJ~Dk^}2U9N)uAobhqnlgUp-tug*b_4CB z>3trXKCC*WVW)pH>~-Z}`U^Clo(toV|X-=PoOjH+p*= z?C1M!y>`o8lNoC6I&8h`4R;$EO#Qar9hSS>X#D|ipNHJ@*!nhd_gWcD1D5_q?%r;1 zzm@;O!a(^8tF6Dz^o57TaxH`9Iv&apSEJ>i(1M<$;?e2CIy{WSaAv`;<1qujZViV4 zU8G`Ih8L}OQjyq7iz5sNPgYboT4Fo_Ph5j2;VdMY2LeNQ_malL5fxy}SPGm4^mnZa z9mmC3EF6aC80tLW@R06e5Q*+d#smSh0>+Cxdk;?5G7I7262^?n?ch89j1Gi)4$8ZB zjJpv!uN~ozLQ9coq|25gUC;<4hv&T$oq%=)!y=4iw=z|aG5KJeW<_UK6kSX$kWs(; zUy#nAA;!s4(hYc$jFz0mS`mhm^fLnxc_i5o8+?u(37 zctR<}!Nc3Ch)H)3Ngi4Zt6P$W5EsOqsvyDGL3Fp4G=>ad*EEQSvx1sQs1)C<6_W`LUGnO+>SvC@ znXtfpIZNGk`4JTyauBsoGNUV?g8-P)WdHyv*D`C?vO!aYitssFdK*{^`oJ7YvUHhx zj=(_RlFT9IEYrGDk{I1pg8K%TQ6$hDen>#gg%{}nacw0vlhJK@U0E9_t;pB)dF~mo zl=topUT}sif~)+RuFnJ3{@-Cu>72?l39L(Rnuvf?j{4PvAA=!@Mo)ekFZv-o^%s98 zNFS6_asS(6kYGW9-zSn?#h@}Gz3r#w)h|sz00|yFD#phpQSnD)(LW_AiWEO5qbVAe z@hWM?DG`2(h-$`UM2<{BAeaP(kY=EfDK*P!cM6_ZlYk{u(5%Sb<)izwnNotlBZ$ifpXeOC>5slnAO{g=nZq$(xiG!BO+I-Mz%uTe_J=j5R+zKkLqW+cHNMeT_Fx&4&K=Oy0K*LPt;I zjW@pXv}FdMbp5SsZ_U=t?aF)FGXswto*UkIN7JHn!;SP@=k3jRI&U}4JA1x%)-6C& z$Xm-8*J z7FxPe|(rB_Nr%3PeA5*N_bcmvn^}iJ#X%$h2!() z*PxI&eC5ddM?M}bxZ5AN+vk+Ldw1U6Id6XX_YUWGOBU$%-O~XE5Y$t`!h$XM8gA9z zto_tiXzqT{+BijuB zw#@(^2#j#c0lBcl!+~l!1;Xe#rbwIqU?GRFE{Vw=NZ<`dgdf!=BJkS>y(;v8(@R7T zj{G)dIytQX5bwV%sWMu$BDBFHrr0OBDJtUTg#QKmp z9ug;DYKD8y`b%qpcyh#(^=_HlJWqE0$q2tOE}P+(tmVdb2EuSi{xrq_a9KLd5Z<0i b&vU-7t?p}~k4Ey=#w_3Xm?L~WRpb8vM^48d literal 0 HcmV?d00001 diff --git a/python/local_stt_worker.py b/python/local_stt_worker.py new file mode 100644 index 0000000..a94783f --- /dev/null +++ b/python/local_stt_worker.py @@ -0,0 +1,145 @@ +import base64 +import json +import os +import sys +import tempfile +import traceback +import wave + + +os.environ.setdefault("PYTHONIOENCODING", "utf-8") + + +def log(message: str) -> None: + print(message, file=sys.stderr, flush=True) + + +def write_response(request_id: int, ok: bool, result=None, error: str | None = None) -> None: + payload = { + "id": request_id, + "ok": ok, + } + if ok: + payload["result"] = result + else: + payload["error"] = error or "unknown error" + + sys.stdout.write(json.dumps(payload, ensure_ascii=False) + "\n") + sys.stdout.flush() + + +def resolve_device() -> str: + raw = os.environ.get("LOCAL_STT_DEVICE", "auto").strip().lower() + if raw and raw != "auto": + return raw + + try: + import ctranslate2 + + if ctranslate2.get_cuda_device_count() > 0: + return "cuda" + except Exception: + pass + + return "cpu" + + +def resolve_compute_type(device: str) -> str: + raw = os.environ.get("LOCAL_STT_COMPUTE_TYPE", "auto").strip().lower() + if raw and raw != "auto": + return raw + if device == "cuda": + return "int8_float16" + return "int8" + + +class SttWorker: + def __init__(self) -> None: + from faster_whisper import WhisperModel + + self.model_name = os.environ.get("LOCAL_STT_MODEL", "tiny").strip() or "tiny" + self.device = resolve_device() + self.compute_type = resolve_compute_type(self.device) + self.beam_size = int(os.environ.get("LOCAL_STT_BEAM_SIZE", "1")) + self.model = WhisperModel( + self.model_name, + device=self.device, + compute_type=self.compute_type, + ) + log( + f"local-stt ready model={self.model_name} device={self.device} compute={self.compute_type} beam={self.beam_size}" + ) + + def transcribe(self, audio_base64: str, language: str | None) -> str: + pcm_bytes = base64.b64decode(audio_base64) + temp_path = "" + + try: + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle: + temp_path = handle.name + + with wave.open(temp_path, "wb") as wav_file: + wav_file.setnchannels(1) + wav_file.setsampwidth(2) + wav_file.setframerate(16000) + wav_file.writeframes(pcm_bytes) + + segments, _info = self.model.transcribe( + temp_path, + language=language, + beam_size=self.beam_size, + best_of=1, + condition_on_previous_text=False, + vad_filter=False, + without_timestamps=True, + temperature=0.0, + ) + return " ".join(segment.text.strip() for segment in segments if segment.text.strip()).strip() + finally: + if temp_path: + try: + os.unlink(temp_path) + except OSError: + pass + + +def main() -> int: + try: + worker = SttWorker() + except Exception as exc: + log("failed to initialize local STT worker") + log("run `bun run setup:local-ai` first if dependencies are missing") + log("".join(traceback.format_exception(exc))) + return 1 + + for line in sys.stdin: + line = line.strip() + if not line: + continue + + try: + request = json.loads(line) + request_id = int(request["id"]) + method = request["method"] + params = request.get("params", {}) + + if method == "ping": + write_response(request_id, True, {"ready": True}) + continue + if method != "transcribe": + raise ValueError(f"unsupported method: {method}") + + text = worker.transcribe( + audio_base64=str(params.get("audio_base64", "")), + language=str(params.get("language") or "").strip() or None, + ) + write_response(request_id, True, {"text": text}) + except Exception as exc: + error_text = "".join(traceback.format_exception_only(type(exc), exc)).strip() + write_response(request_id, False, error=error_text) + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/python/local_tts_worker.py b/python/local_tts_worker.py new file mode 100644 index 0000000..10a25e9 --- /dev/null +++ b/python/local_tts_worker.py @@ -0,0 +1,125 @@ +import base64 +import json +import os +import sys +import tempfile +import traceback + + +os.environ.setdefault("PYTHONIOENCODING", "utf-8") + + +def log(message: str) -> None: + print(message, file=sys.stderr, flush=True) + + +def write_response(request_id: int, ok: bool, result=None, error: str | None = None) -> None: + payload = { + "id": request_id, + "ok": ok, + } + if ok: + payload["result"] = result + else: + payload["error"] = error or "unknown error" + + sys.stdout.write(json.dumps(payload, ensure_ascii=False) + "\n") + sys.stdout.flush() + + +class TtsWorker: + def __init__(self) -> None: + from melo.api import TTS + + self.language = os.environ.get("LOCAL_TTS_LANGUAGE", "KR").strip() or "KR" + self.speaker_key = os.environ.get("LOCAL_TTS_SPEAKER", "KR").strip() or "KR" + self.device = os.environ.get("LOCAL_TTS_DEVICE", "auto").strip() or "auto" + self.speed = float(os.environ.get("LOCAL_TTS_SPEED", "1.12")) + + self.model = TTS(language=self.language, device=self.device) + speaker_ids = self.model.hps.data.spk2id + self.speaker_id = speaker_ids.get(self.speaker_key) + + if self.speaker_id is None: + normalized = self.speaker_key.upper() + self.speaker_id = speaker_ids.get(normalized) + + if self.speaker_id is None: + self.speaker_id = next(iter(speaker_ids.values())) + + log( + f"local-tts ready language={self.language} speaker={self.speaker_key} device={self.device} speed={self.speed}" + ) + + def synthesize(self, text: str) -> bytes: + temp_path = "" + + try: + with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle: + temp_path = handle.name + + self.model.tts_to_file( + text, + self.speaker_id, + temp_path, + speed=self.speed, + quiet=True, + ) + + with open(temp_path, "rb") as handle: + return handle.read() + finally: + if temp_path: + try: + os.unlink(temp_path) + except OSError: + pass + + +def main() -> int: + try: + worker = TtsWorker() + except Exception as exc: + log("failed to initialize local TTS worker") + log("run `bun run setup:local-ai` first if dependencies are missing") + log("".join(traceback.format_exception(exc))) + return 1 + + for line in sys.stdin: + line = line.strip() + if not line: + continue + + try: + request = json.loads(line) + request_id = int(request["id"]) + method = request["method"] + params = request.get("params", {}) + + if method == "ping": + write_response(request_id, True, {"ready": True}) + continue + if method != "synthesize": + raise ValueError(f"unsupported method: {method}") + + text = str(params.get("text", "")).strip() + if not text: + raise ValueError("text is empty") + + audio = worker.synthesize(text) + write_response( + request_id, + True, + { + "wav_base64": base64.b64encode(audio).decode("ascii"), + }, + ) + except Exception as exc: + error_text = "".join(traceback.format_exception_only(type(exc), exc)).strip() + write_response(request_id, False, error=error_text) + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/python/requirements.txt b/python/requirements.txt new file mode 100644 index 0000000..ccdfa62 --- /dev/null +++ b/python/requirements.txt @@ -0,0 +1,2 @@ +faster-whisper==1.2.1 +git+https://github.com/myshell-ai/MeloTTS.git@v0.1.2 diff --git a/src/audio/guild-voice-session.ts b/src/audio/guild-voice-session.ts index fb93718..78bf6e7 100644 --- a/src/audio/guild-voice-session.ts +++ b/src/audio/guild-voice-session.ts @@ -22,9 +22,9 @@ import type { AppConfig } from "../config.js"; import { Logger } from "../logger.js"; import { float32ToPcm16Buffer, int16ArrayToFloat32, Stereo48kToMono16kDownsampler, takeFrame } from "./pcm.js"; import { ConversationMemory, type UserUtterance } from "../services/conversation.js"; -import { ElevenLabsSttService } from "../services/elevenlabs-stt.js"; -import { ElevenLabsTtsService, type PreparedSpeechAudio } from "../services/elevenlabs-tts.js"; import type { LlmService } from "../services/llm.js"; +import type { SttService } from "../services/stt.js"; +import type { PreparedSpeechAudio, TtsService } from "../services/tts.js"; interface GuildVoiceSessionOptions { client: Client; @@ -33,8 +33,8 @@ interface GuildVoiceSessionOptions { guild: Guild; voiceChannel: VoiceBasedChannel; textChannelId?: string; - stt: ElevenLabsSttService; - tts: ElevenLabsTtsService; + stt: SttService; + tts: TtsService; llm: LlmService; } diff --git a/src/audio/local-voice-session.ts b/src/audio/local-voice-session.ts index 47c8148..8541e7c 100644 --- a/src/audio/local-voice-session.ts +++ b/src/audio/local-voice-session.ts @@ -12,15 +12,15 @@ import { Logger } from "../logger.js"; import { requireFfmpegPath } from "./ffmpeg-path.js"; import { takeFrame, int16ArrayToFloat32, float32ToPcm16Buffer } from "./pcm.js"; import { ConversationMemory, type UserUtterance } from "../services/conversation.js"; -import { ElevenLabsSttService } from "../services/elevenlabs-stt.js"; -import { ElevenLabsTtsService, type PreparedSpeechAudio } from "../services/elevenlabs-tts.js"; import type { LlmService } from "../services/llm.js"; +import type { SttService } from "../services/stt.js"; +import type { PreparedSpeechAudio, TtsService } from "../services/tts.js"; interface LocalVoiceSessionOptions { config: AssistantRuntimeConfig; logger: Logger; - stt: ElevenLabsSttService; - tts: ElevenLabsTtsService; + stt: SttService; + tts: TtsService; llm: LlmService; } diff --git a/src/config.ts b/src/config.ts index cd33135..c814ac1 100644 --- a/src/config.ts +++ b/src/config.ts @@ -15,14 +15,21 @@ const envSchema = z.object({ DISCORD_BOT_TOKEN: emptyToUndefined, DISCORD_APPLICATION_ID: emptyToUndefined, DISCORD_COMMAND_GUILD_ID: emptyToUndefined, - ELEVENLABS_API_KEY: emptyToUndefined, - ELEVENLABS_VOICE_ID: emptyToUndefined, - ELEVENLABS_STT_MODEL: z.string().min(1).default("scribe_v2_realtime"), - ELEVENLABS_TTS_MODEL: z.string().min(1).default("eleven_flash_v2_5"), OLLAMA_BASE_URL: z.string().min(1).default("http://localhost:11434"), OLLAMA_MODEL: z.string().min(1).default("qwen3:0.6b"), OLLAMA_KEEP_ALIVE: z.string().min(1).default("5m"), OLLAMA_NUM_CTX: z.coerce.number().int().min(512).max(32768).default(4096), + LOCAL_AI_VENV_PATH: z.string().min(1).default(".local-ai/.venv"), + LOCAL_AI_CACHE_DIR: z.string().min(1).default(".local-ai/cache"), + LOCAL_AI_PYTHON: emptyToUndefined, + LOCAL_STT_MODEL: z.string().min(1).default("tiny"), + LOCAL_STT_DEVICE: z.string().min(1).default("auto"), + LOCAL_STT_COMPUTE_TYPE: z.string().min(1).default("auto"), + LOCAL_STT_BEAM_SIZE: z.coerce.number().int().min(1).max(8).default(1), + LOCAL_TTS_LANGUAGE: z.string().min(1).default("KR"), + LOCAL_TTS_SPEAKER: z.string().min(1).default("KR"), + LOCAL_TTS_DEVICE: z.string().min(1).default("auto"), + LOCAL_TTS_SPEED: z.coerce.number().min(0.8).max(1.6).default(1.12), BOT_DEFAULT_LANGUAGE: z.string().min(2).default("ko"), MAX_CONVERSATION_TURNS: z.coerce.number().int().min(4).max(30).default(12), LOCAL_AUDIO_SOURCE: emptyToUndefined, @@ -36,10 +43,7 @@ const envSchema = z.object({ }); export type AppConfig = z.infer; -export type AssistantRuntimeConfig = AppConfig & { - ELEVENLABS_API_KEY: string; - ELEVENLABS_VOICE_ID: string; -}; +export type AssistantRuntimeConfig = AppConfig; export type DiscordRuntimeConfig = AssistantRuntimeConfig & { DISCORD_BOT_TOKEN: string; DISCORD_APPLICATION_ID: string; @@ -57,11 +61,7 @@ function requirePresent(value: string | undefined, name: string): string { } export function requireAssistantRuntimeConfig(config: AppConfig): AssistantRuntimeConfig { - return { - ...config, - ELEVENLABS_API_KEY: requirePresent(config.ELEVENLABS_API_KEY, "ELEVENLABS_API_KEY"), - ELEVENLABS_VOICE_ID: requirePresent(config.ELEVENLABS_VOICE_ID, "ELEVENLABS_VOICE_ID"), - }; + return config; } export function requireDiscordRuntimeConfig(config: AppConfig): DiscordRuntimeConfig { diff --git a/src/discord-main.ts b/src/discord-main.ts index f06045d..88cde72 100644 --- a/src/discord-main.ts +++ b/src/discord-main.ts @@ -15,8 +15,8 @@ import { Client as DiscordClient } from "discord.js"; import { GuildVoiceSession } from "./audio/guild-voice-session.js"; import { type DiscordRuntimeConfig } from "./config.js"; import { Logger } from "./logger.js"; -import { ElevenLabsSttService } from "./services/elevenlabs-stt.js"; -import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js"; +import { LocalFasterWhisperSttService } from "./services/local-stt.js"; +import { LocalMeloTtsService } from "./services/local-tts.js"; import { OllamaLlmService } from "./services/ollama-llm.js"; export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger): Promise { @@ -37,11 +37,14 @@ export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger intents: [GatewayIntentBits.Guilds, GatewayIntentBits.GuildVoiceStates], }); - const stt = new ElevenLabsSttService(config); - const tts = new ElevenLabsTtsService(config); + const stt = new LocalFasterWhisperSttService(config, logger); + const tts = new LocalMeloTtsService(config, logger); const llm = new OllamaLlmService(config); const sessions = new Map(); + await stt.warmup(); + await tts.warmup(); + function getVoiceChannel(interaction: ChatInputCommandInteraction): VoiceBasedChannel | null { const member = interaction.member as GuildMember | null; return member?.voice.channel ?? null; @@ -174,6 +177,7 @@ export async function runDiscordBot(config: DiscordRuntimeConfig, logger: Logger }); } sessions.clear(); + await Promise.allSettled([stt.destroy?.(), tts.destroy?.()]); await client.destroy(); process.exit(exitCode); } diff --git a/src/local-main.ts b/src/local-main.ts index 124e57f..ebd2c5a 100644 --- a/src/local-main.ts +++ b/src/local-main.ts @@ -5,8 +5,8 @@ import type { AssistantRuntimeConfig } from "./config.js"; import { Logger } from "./logger.js"; import { LocalVoiceSession } from "./audio/local-voice-session.js"; import { requireFfmpegPath } from "./audio/ffmpeg-path.js"; -import { ElevenLabsSttService } from "./services/elevenlabs-stt.js"; -import { ElevenLabsTtsService } from "./services/elevenlabs-tts.js"; +import { LocalFasterWhisperSttService } from "./services/local-stt.js"; +import { LocalMeloTtsService } from "./services/local-tts.js"; import { OllamaLlmService } from "./services/ollama-llm.js"; export async function printLocalAudioDevices(): Promise { @@ -67,9 +67,13 @@ export async function printLocalAudioDevices(): Promise { } export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: Logger): Promise { - const stt = new ElevenLabsSttService(config); - const tts = new ElevenLabsTtsService(config); + const stt = new LocalFasterWhisperSttService(config, logger); + const tts = new LocalMeloTtsService(config, logger); const llm = new OllamaLlmService(config); + + await stt.warmup(); + await tts.warmup(); + const session = new LocalVoiceSession({ config, logger, @@ -91,6 +95,7 @@ export async function runLocalAssistant(config: AssistantRuntimeConfig, logger: await session.destroy().catch((error) => { logger.warn("Local session shutdown failed", error); }); + await Promise.allSettled([stt.destroy?.(), tts.destroy?.()]); process.exit(exitCode); }; diff --git a/src/python-runtime.ts b/src/python-runtime.ts new file mode 100644 index 0000000..48d4bcb --- /dev/null +++ b/src/python-runtime.ts @@ -0,0 +1,90 @@ +import { existsSync } from "node:fs"; +import { spawnSync } from "node:child_process"; +import path from "node:path"; + +import type { AppConfig } from "./config.js"; + +export interface PythonLaunch { + command: string; + args: string[]; + source: "venv" | "configured" | "system"; +} + +function splitCommandSpec(spec: string): string[] { + return spec.match(/(?:[^\s"]+|"[^"]*")+/g)?.map((part) => part.replace(/^"|"$/g, "")) ?? []; +} + +function canRun(command: string, args: string[]): boolean { + const result = spawnSync(command, [...args, "--version"], { + encoding: "utf8", + }); + return result.status === 0; +} + +export function resolveLocalAiVenvPath(config: AppConfig): string { + return path.resolve(process.cwd(), config.LOCAL_AI_VENV_PATH); +} + +export function resolveLocalAiCachePath(config: AppConfig): string { + return path.resolve(process.cwd(), config.LOCAL_AI_CACHE_DIR); +} + +export function resolveVenvPythonPath(config: AppConfig): string { + const venvPath = resolveLocalAiVenvPath(config); + return process.platform === "win32" + ? path.join(venvPath, "Scripts", "python.exe") + : path.join(venvPath, "bin", "python"); +} + +export function resolvePythonLaunch(config: AppConfig, options?: { preferVenv?: boolean }): PythonLaunch { + const preferVenv = options?.preferVenv ?? true; + const venvPython = resolveVenvPythonPath(config); + + if (preferVenv && existsSync(venvPython)) { + return { + command: venvPython, + args: [], + source: "venv", + }; + } + + const configured = config.LOCAL_AI_PYTHON ? splitCommandSpec(config.LOCAL_AI_PYTHON) : []; + if (configured.length > 0 && canRun(configured[0]!, configured.slice(1))) { + return { + command: configured[0]!, + args: configured.slice(1), + source: "configured", + }; + } + + const candidates = + process.platform === "win32" + ? [ + ["py", "-3"], + ["python"], + ["python3"], + ] + : [ + ["python3"], + ["python"], + ]; + + for (const [command, ...args] of candidates) { + if (canRun(command, args)) { + return { + command, + args, + source: "system", + }; + } + } + + throw new Error( + [ + "Python 실행 파일을 찾지 못했습니다.", + "1. Python 3.11 이상을 설치", + "2. 필요하면 `.env` 에 `LOCAL_AI_PYTHON=python` 또는 `LOCAL_AI_PYTHON=py -3` 설정", + "3. 그 다음 `bun run setup:local-ai` 실행", + ].join("\n"), + ); +} diff --git a/src/services/elevenlabs-stt.ts b/src/services/elevenlabs-stt.ts deleted file mode 100644 index 67b7979..0000000 --- a/src/services/elevenlabs-stt.ts +++ /dev/null @@ -1,124 +0,0 @@ -import WebSocket from "ws"; - -import type { AssistantRuntimeConfig } from "../config.js"; - -interface ElevenLabsMessage { - message_type?: string; - text?: string; - error?: string; -} - -const NON_FATAL_ERROR_TYPES = new Set([ - "insufficient_audio_activity", -]); - -export class ElevenLabsSttService { - constructor(private readonly config: AssistantRuntimeConfig) {} - - async transcribePcm16(pcm16MonoAudio: Buffer): Promise { - if (pcm16MonoAudio.byteLength === 0) { - return null; - } - - const url = new URL("wss://api.elevenlabs.io/v1/speech-to-text/realtime"); - url.searchParams.set("model_id", this.config.ELEVENLABS_STT_MODEL); - url.searchParams.set("language_code", this.config.BOT_DEFAULT_LANGUAGE); - url.searchParams.set("audio_format", "pcm_16000"); - url.searchParams.set("commit_strategy", "manual"); - url.searchParams.set("include_timestamps", "false"); - url.searchParams.set("include_language_detection", "false"); - url.searchParams.set("enable_logging", "false"); - - return await new Promise((resolve, reject) => { - const socket = new WebSocket(url, { - headers: { - "xi-api-key": this.config.ELEVENLABS_API_KEY, - }, - }); - - let settled = false; - let lastTranscript = ""; - - const timeout = setTimeout(() => { - finish(lastTranscript || null); - }, 15_000); - - const finish = (result: string | null, error?: Error) => { - if (settled) { - return; - } - settled = true; - clearTimeout(timeout); - try { - socket.close(); - } catch { - // Ignore close race. - } - - if (error) { - reject(error); - return; - } - resolve(result); - }; - - socket.on("message", (raw) => { - let message: ElevenLabsMessage; - try { - message = JSON.parse(raw.toString()) as ElevenLabsMessage; - } catch (error) { - finish(null, error as Error); - return; - } - - switch (message.message_type) { - case "session_started": - socket.send( - JSON.stringify({ - message_type: "input_audio_chunk", - audio_base_64: pcm16MonoAudio.toString("base64"), - commit: true, - sample_rate: 16000, - }), - ); - return; - case "partial_transcript": - return; - case "committed_transcript": - case "committed_transcript_with_timestamps": { - const transcript = message.text?.trim() ?? ""; - if (transcript.length > 0) { - lastTranscript = transcript; - finish(transcript); - } - return; - } - default: - if (!message.message_type?.endsWith("error") && !message.message_type) { - return; - } - - if (message.message_type && NON_FATAL_ERROR_TYPES.has(message.message_type)) { - finish(null); - return; - } - - finish( - null, - new Error(message.error ?? `ElevenLabs STT error: ${message.message_type ?? "unknown"}`), - ); - } - }); - - socket.on("error", (error) => { - finish(null, error as Error); - }); - - socket.on("close", () => { - if (!settled) { - finish(lastTranscript || null); - } - }); - }); - } -} diff --git a/src/services/elevenlabs-tts.ts b/src/services/elevenlabs-tts.ts deleted file mode 100644 index 2b38975..0000000 --- a/src/services/elevenlabs-tts.ts +++ /dev/null @@ -1,78 +0,0 @@ -import { Readable } from "node:stream"; - -import prism from "prism-media"; - -import type { AssistantRuntimeConfig } from "../config.js"; -import { resolveFfmpegPath } from "../audio/ffmpeg-path.js"; - -export interface PreparedSpeechAudio { - stream: Readable; - dispose: () => void; -} - -export class ElevenLabsTtsService { - constructor(private readonly config: AssistantRuntimeConfig) { - const resolvedFfmpegPath = resolveFfmpegPath(); - if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) { - process.env.FFMPEG_PATH = resolvedFfmpegPath; - } - } - - async preparePlayback(text: string, signal?: AbortSignal): Promise { - const url = new URL(`https://api.elevenlabs.io/v1/text-to-speech/${this.config.ELEVENLABS_VOICE_ID}/stream`); - url.searchParams.set("output_format", "mp3_44100_128"); - url.searchParams.set("enable_logging", "false"); - - const response = await fetch(url, { - method: "POST", - headers: { - "Content-Type": "application/json", - "xi-api-key": this.config.ELEVENLABS_API_KEY, - }, - body: JSON.stringify({ - text, - model_id: this.config.ELEVENLABS_TTS_MODEL, - language_code: this.config.BOT_DEFAULT_LANGUAGE, - voice_settings: { - stability: 0.35, - similarity_boost: 0.75, - speed: 1.05, - }, - }), - signal, - }); - - if (!response.ok || !response.body) { - throw new Error(`ElevenLabs TTS request failed with status ${response.status}`); - } - - const input = Readable.fromWeb(response.body as never); - const ffmpeg = new prism.FFmpeg({ - args: [ - "-analyzeduration", - "0", - "-loglevel", - "0", - "-i", - "pipe:0", - "-f", - "s16le", - "-ar", - "48000", - "-ac", - "2", - "pipe:1", - ], - }); - - input.pipe(ffmpeg); - - return { - stream: ffmpeg, - dispose: () => { - input.destroy(); - ffmpeg.destroy(); - }, - }; - } -} diff --git a/src/services/local-stt.ts b/src/services/local-stt.ts new file mode 100644 index 0000000..60c3339 --- /dev/null +++ b/src/services/local-stt.ts @@ -0,0 +1,43 @@ +import type { AssistantRuntimeConfig } from "../config.js"; +import type { Logger } from "../logger.js"; +import { PythonJsonWorker } from "./python-json-worker.js"; +import type { SttService } from "./stt.js"; + +interface TranscribeResult { + text?: string; +} + +export class LocalFasterWhisperSttService implements SttService { + private readonly worker: PythonJsonWorker; + + constructor(private readonly config: AssistantRuntimeConfig, logger: Logger) { + this.worker = new PythonJsonWorker(config, logger, "local_stt_worker.py", "local-stt", { + LOCAL_STT_MODEL: config.LOCAL_STT_MODEL, + LOCAL_STT_DEVICE: config.LOCAL_STT_DEVICE, + LOCAL_STT_COMPUTE_TYPE: config.LOCAL_STT_COMPUTE_TYPE, + LOCAL_STT_BEAM_SIZE: String(config.LOCAL_STT_BEAM_SIZE), + }); + } + + async warmup(): Promise { + await this.worker.request("ping", {}); + } + + async transcribePcm16(pcm16MonoAudio: Buffer): Promise { + if (pcm16MonoAudio.byteLength === 0) { + return null; + } + + const result = await this.worker.request("transcribe", { + audio_base64: pcm16MonoAudio.toString("base64"), + language: this.config.BOT_DEFAULT_LANGUAGE, + }); + + const transcript = result.text?.trim() ?? ""; + return transcript.length > 0 ? transcript : null; + } + + async destroy(): Promise { + await this.worker.destroy(); + } +} diff --git a/src/services/local-tts.ts b/src/services/local-tts.ts new file mode 100644 index 0000000..3191c09 --- /dev/null +++ b/src/services/local-tts.ts @@ -0,0 +1,94 @@ +import { Readable } from "node:stream"; + +import prism from "prism-media"; + +import type { AssistantRuntimeConfig } from "../config.js"; +import type { Logger } from "../logger.js"; +import { resolveFfmpegPath } from "../audio/ffmpeg-path.js"; +import { PythonJsonWorker } from "./python-json-worker.js"; +import type { PreparedSpeechAudio, TtsService } from "./tts.js"; + +interface SynthesizeResult { + wav_base64?: string; +} + +export class LocalMeloTtsService implements TtsService { + private readonly worker: PythonJsonWorker; + + constructor(config: AssistantRuntimeConfig, logger: Logger) { + const resolvedFfmpegPath = resolveFfmpegPath(); + if (resolvedFfmpegPath && !process.env.FFMPEG_PATH) { + process.env.FFMPEG_PATH = resolvedFfmpegPath; + } + + this.worker = new PythonJsonWorker(config, logger, "local_tts_worker.py", "local-tts", { + LOCAL_TTS_LANGUAGE: config.LOCAL_TTS_LANGUAGE, + LOCAL_TTS_SPEAKER: config.LOCAL_TTS_SPEAKER, + LOCAL_TTS_DEVICE: config.LOCAL_TTS_DEVICE, + LOCAL_TTS_SPEED: String(config.LOCAL_TTS_SPEED), + }); + } + + async warmup(): Promise { + await this.worker.request("ping", {}); + } + + async preparePlayback(text: string, signal?: AbortSignal): Promise { + const result = await this.worker.request( + "synthesize", + { + text, + }, + signal, + ); + + const wavBase64 = result.wav_base64; + if (!wavBase64) { + throw new Error("로컬 TTS가 빈 오디오를 반환했습니다."); + } + + const input = Readable.from([Buffer.from(wavBase64, "base64")]); + const ffmpeg = new prism.FFmpeg({ + args: [ + "-analyzeduration", + "0", + "-loglevel", + "0", + "-i", + "pipe:0", + "-f", + "s16le", + "-ar", + "48000", + "-ac", + "2", + "pipe:1", + ], + }); + + if (signal) { + signal.addEventListener( + "abort", + () => { + input.destroy(); + ffmpeg.destroy(); + }, + { once: true }, + ); + } + + input.pipe(ffmpeg); + + return { + stream: ffmpeg, + dispose: () => { + input.destroy(); + ffmpeg.destroy(); + }, + }; + } + + async destroy(): Promise { + await this.worker.destroy(); + } +} diff --git a/src/services/python-json-worker.ts b/src/services/python-json-worker.ts new file mode 100644 index 0000000..f48f304 --- /dev/null +++ b/src/services/python-json-worker.ts @@ -0,0 +1,189 @@ +import { spawn, type ChildProcessWithoutNullStreams } from "node:child_process"; +import { createInterface } from "node:readline"; +import path from "node:path"; + +import type { AssistantRuntimeConfig } from "../config.js"; +import type { Logger } from "../logger.js"; +import { resolveLocalAiCachePath, resolvePythonLaunch } from "../python-runtime.js"; + +interface WorkerRequest { + id: number; + method: string; + params: Record; +} + +interface WorkerResponse { + id: number; + ok: boolean; + result?: unknown; + error?: string; +} + +export class PythonJsonWorker { + private child: ChildProcessWithoutNullStreams | null = null; + private nextId = 1; + private readonly pending = new Map< + number, + { + resolve: (value: unknown) => void; + reject: (error: Error) => void; + } + >(); + + constructor( + private readonly config: AssistantRuntimeConfig, + private readonly logger: Logger, + private readonly scriptName: string, + private readonly label: string, + private readonly workerEnv: Record, + ) {} + + async request(method: string, params: Record, signal?: AbortSignal): Promise { + const child = this.ensureStarted(); + const id = this.nextId++; + + return await new Promise((resolve, reject) => { + if (signal?.aborted) { + reject(new Error(`${this.label} request aborted before start`)); + return; + } + + const abortHandler = () => { + this.pending.delete(id); + reject(new Error(`${this.label} request aborted`)); + }; + + if (signal) { + signal.addEventListener("abort", abortHandler, { once: true }); + } + + this.pending.set(id, { + resolve: (value) => { + if (signal) { + signal.removeEventListener("abort", abortHandler); + } + resolve(value as T); + }, + reject: (error) => { + if (signal) { + signal.removeEventListener("abort", abortHandler); + } + reject(error); + }, + }); + + const message: WorkerRequest = { + id, + method, + params, + }; + + child.stdin.write(`${JSON.stringify(message)}\n`); + }); + } + + async destroy(): Promise { + this.rejectAll(new Error(`${this.label} worker terminated`)); + + if (!this.child) { + return; + } + + const child = this.child; + this.child = null; + + child.kill("SIGTERM"); + await new Promise((resolve) => { + child.once("exit", () => resolve()); + setTimeout(resolve, 1_500); + }); + } + + private ensureStarted(): ChildProcessWithoutNullStreams { + if (this.child) { + return this.child; + } + + const launch = resolvePythonLaunch(this.config); + const scriptPath = path.resolve(process.cwd(), "python", this.scriptName); + const cachePath = resolveLocalAiCachePath(this.config); + const recentStderr: string[] = []; + + const child = spawn(launch.command, [...launch.args, scriptPath], { + stdio: ["pipe", "pipe", "pipe"], + env: { + ...process.env, + HF_HOME: cachePath, + TRANSFORMERS_CACHE: cachePath, + PYTHONIOENCODING: "utf-8", + BOT_DEFAULT_LANGUAGE: this.config.BOT_DEFAULT_LANGUAGE, + ...this.workerEnv, + }, + }); + + createInterface({ + input: child.stdout, + crlfDelay: Number.POSITIVE_INFINITY, + }).on("line", (line) => { + if (!line.trim()) { + return; + } + + let payload: WorkerResponse; + try { + payload = JSON.parse(line) as WorkerResponse; + } catch (error) { + this.logger.warn(`${this.label} stdout parse failed`, error); + return; + } + + const pending = this.pending.get(payload.id); + if (!pending) { + return; + } + + this.pending.delete(payload.id); + if (payload.ok) { + pending.resolve(payload.result); + return; + } + + pending.reject(new Error(payload.error ?? `${this.label} worker error`)); + }); + + child.stderr.on("data", (chunk: Buffer) => { + const text = chunk.toString().trim(); + if (text.length > 0) { + recentStderr.push(text); + if (recentStderr.length > 20) { + recentStderr.shift(); + } + this.logger.warn(`[${this.label}]`, text); + } + }); + + child.on("exit", (code, signal) => { + if (this.child === child) { + this.child = null; + } + + const detail = recentStderr.length > 0 ? `\n${recentStderr.join("\n")}` : ""; + this.rejectAll(new Error(`${this.label} worker exited code=${code ?? "null"} signal=${signal ?? "null"}${detail}`)); + }); + + child.on("error", (error) => { + this.rejectAll(error as Error); + }); + + this.child = child; + return child; + } + + private rejectAll(error: Error): void { + const pending = [...this.pending.values()]; + this.pending.clear(); + for (const item of pending) { + item.reject(error); + } + } +} diff --git a/src/services/stt.ts b/src/services/stt.ts new file mode 100644 index 0000000..393aeb8 --- /dev/null +++ b/src/services/stt.ts @@ -0,0 +1,4 @@ +export interface SttService { + transcribePcm16(pcm16MonoAudio: Buffer): Promise; + destroy?(): Promise; +} diff --git a/src/services/tts.ts b/src/services/tts.ts new file mode 100644 index 0000000..556b3a6 --- /dev/null +++ b/src/services/tts.ts @@ -0,0 +1,11 @@ +import type { Readable } from "node:stream"; + +export interface PreparedSpeechAudio { + stream: Readable; + dispose: () => void; +} + +export interface TtsService { + preparePlayback(text: string, signal?: AbortSignal): Promise; + destroy?(): Promise; +} diff --git a/src/setup-local-ai.ts b/src/setup-local-ai.ts new file mode 100644 index 0000000..229ac92 --- /dev/null +++ b/src/setup-local-ai.ts @@ -0,0 +1,88 @@ +import { existsSync } from "node:fs"; +import { mkdir } from "node:fs/promises"; +import { spawn } from "node:child_process"; +import path from "node:path"; + +import { loadConfig } from "./config.js"; +import { resolveLocalAiCachePath, resolveLocalAiVenvPath, resolvePythonLaunch, resolveVenvPythonPath } from "./python-runtime.js"; + +async function run(command: string, args: string[], extraEnv?: NodeJS.ProcessEnv): Promise { + await new Promise((resolve, reject) => { + const child = spawn(command, args, { + stdio: "inherit", + env: { + ...process.env, + ...extraEnv, + }, + }); + + child.on("exit", (code) => { + if (code === 0) { + resolve(); + return; + } + reject(new Error(`${command} ${args.join(" ")} exited with code ${code ?? "null"}`)); + }); + child.on("error", reject); + }); +} + +async function ensurePip(pythonBin: string, env: NodeJS.ProcessEnv): Promise { + await new Promise((resolve, reject) => { + const child = spawn(pythonBin, ["-m", "pip", "--version"], { + stdio: "ignore", + env, + }); + child.on("exit", (code) => { + if (code === 0) { + resolve(); + return; + } + reject(new Error("pip missing")); + }); + child.on("error", reject); + }).catch(async () => { + await run(pythonBin, ["-m", "ensurepip", "--upgrade"], env); + }); +} + +async function main(): Promise { + const config = loadConfig(); + const venvPath = resolveLocalAiVenvPath(config); + const venvPython = resolveVenvPythonPath(config); + const cachePath = resolveLocalAiCachePath(config); + const requirementsPath = path.resolve(process.cwd(), "python", "requirements.txt"); + const baseEnv = { + HF_HOME: cachePath, + TRANSFORMERS_CACHE: cachePath, + PYTHONIOENCODING: "utf-8", + }; + + await mkdir(cachePath, { recursive: true }); + + if (!existsSync(venvPython)) { + const launch = resolvePythonLaunch(config, { preferVenv: false }); + console.log(`기본 Python 확인: ${launch.command} ${launch.args.join(" ")}`.trim()); + console.log(`가상환경 생성: ${venvPath}`); + await run(launch.command, [...launch.args, "-m", "venv", venvPath], baseEnv); + } + + await ensurePip(venvPython, { + ...process.env, + ...baseEnv, + }); + + console.log("로컬 AI 의존성 설치를 시작합니다."); + await run(venvPython, ["-m", "pip", "install", "--upgrade", "pip", "setuptools", "wheel"], baseEnv); + await run(venvPython, ["-m", "pip", "install", "-r", requirementsPath], baseEnv); + + console.log("설치가 끝났습니다."); + console.log("다음 순서:"); + console.log("1. bun run devices"); + console.log("2. bun run start:local"); +} + +void main().catch((error) => { + console.error(error instanceof Error ? error.message : String(error)); + process.exit(1); +});