From 0dbc0300d77cfe1ae5d7c71844224a4b4901f7bb Mon Sep 17 00:00:00 2001 From: javis-bot Date: Tue, 9 Jun 2026 15:49:21 +0900 Subject: [PATCH] Enable GPU: LLM + Whisper on the RTX 5050, pick qwen3:8b GPU acceleration is now on by default and verified end-to-end on the Blackwell RTX 5050 (sm_120): - Ollama offloads 100% to GPU (log: library=CUDA compute=12.0, BLACKWELL_NATIVE_FP4=1). compose passes GPU via CDI (devices: nvidia.com/gpu=all) to both ollama and javis. - Whisper STT on GPU: faster-whisper>=1.1.0 + nvidia-cublas/cudnn cu12, LD_LIBRARY_PATH baked into the image. Verified float16 transcribe on sm_120; bridge auto-falls back to CPU when no GPU is present. - Model: default chat model -> qwen3:8b (best 8GB-VRAM tool-calling, ~5GB Q4). Embed stays nomic-embed-text. - README documents the host one-time setup (nvidia-container-toolkit + `nvidia-ctk cdi generate`) and GPU on/off. Verified: image builds; GPU visible in both containers via compose; ollama ps = 100% GPU; faster-whisper cuda OK + CPU fallback OK; bridge /health 200. --- .env.example | 10 ++++++---- Dockerfile | 5 ++++- README.md | 19 ++++++++++++++++++- bridge/requirements-bridge.txt | 9 ++++++++- bridge/server.py | 11 ++++++++++- docker-compose.yml | 23 ++++++++++++----------- docker/entrypoint.sh | 6 +++--- 7 files changed, 61 insertions(+), 22 deletions(-) diff --git a/.env.example b/.env.example index 15a03b5..dd498af 100644 --- a/.env.example +++ b/.env.example @@ -20,9 +20,10 @@ BRIDGE_HOST=127.0.0.1 BRIDGE_PORT=8765 JARVIS_BRAIN_ENABLED=1 JARVIS_TTS_ENABLED=1 -# faster-whisper device/compute. On this RTX 5050 box: cuda / float16. -WHISPER_DEVICE=auto -WHISPER_COMPUTE_TYPE=auto +# faster-whisper device/compute. GPU by default (RTX 5050 / sm_120, verified). +# Falls back to CPU automatically if no GPU is passed to the container. +WHISPER_DEVICE=cuda +WHISPER_COMPUTE_TYPE=float16 # Optional explicit Piper voice model (.onnx). If empty, the jarvis default is used. TTS_PIPER_MODEL_PATH= @@ -32,7 +33,8 @@ TTS_PIPER_MODEL_PATH= # --------------------------------------------------------------------------- # In docker-compose this is overridden to http://ollama:11434 automatically. OLLAMA_BASE_URL=http://127.0.0.1:11434 -OLLAMA_CHAT_MODEL=llama3.1:8b +# qwen3:8b — best 8GB-VRAM pick: strongest tool-calling, ~5GB Q4, fits the RTX 5050. +OLLAMA_CHAT_MODEL=qwen3:8b OLLAMA_EMBED_MODEL=nomic-embed-text WHISPER_MODEL=small diff --git a/Dockerfile b/Dockerfile index 0e28fbf..93130c2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,7 +8,10 @@ FROM ubuntu:24.04 ENV DEBIAN_FRONTEND=noninteractive \ LANG=C.UTF-8 \ DISPLAY=:1 \ - PATH=/opt/venv/bin:/root/.bun/bin:/usr/local/bin:/usr/bin:/bin + PATH=/opt/venv/bin:/root/.bun/bin:/usr/local/bin:/usr/bin:/bin \ + NVIDIA_VISIBLE_DEVICES=all \ + NVIDIA_DRIVER_CAPABILITIES=compute,utility \ + LD_LIBRARY_PATH=/opt/venv/lib/python3.12/site-packages/nvidia/cublas/lib:/opt/venv/lib/python3.12/site-packages/nvidia/cudnn/lib # --- System packages: desktop, VNC, Chrome deps, ffmpeg, python, ocr --- RUN apt-get update && apt-get install -y --no-install-recommends \ diff --git a/README.md b/README.md index e237d80..50ab08b 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,24 @@ docker compose up -d # 봇이 시작되고 /자비스 명령 등록 디스코드에서 `/자비스 join` 으로 호출하세요. (`OLLAMA_CHAT_MODEL` 등 모델을 바꾸려면 `.env`에서 지정 후 `docker compose up -d`.) -- GPU(RTX 5050) 가속: 호스트에 nvidia-container-toolkit 설치 후 `docker-compose.yml`의 GPU 블록 주석 해제, `.env`에서 `WHISPER_DEVICE=cuda` / `WHISPER_COMPUTE_TYPE=float16`. +### GPU 가속 (기본 ON) + +LLM(Ollama)과 Whisper STT가 **기본적으로 GPU(RTX 5050, Blackwell sm_120)** 에서 돕니다. 검증 완료: Ollama 100% GPU 오프로드, faster-whisper float16 GPU 동작. + +호스트 사전 준비(1회): + +```bash +# nvidia-container-toolkit 설치 후 CDI 스펙 생성 (Docker 29 CDI 방식, 데몬 재시작 불필요) +sudo nvidia-ctk cdi generate --output=/etc/cdi/nvidia.yaml +docker run --rm --device nvidia.com/gpu=all ubuntu nvidia-smi -L # GPU 보이면 OK +``` + +`docker-compose.yml`은 두 컨테이너에 `devices: ["nvidia.com/gpu=all"]`(CDI)로 GPU를 넣습니다. + +- 모델: 기본 `qwen3:8b` — 8GB VRAM에서 도구호출(tool calling)이 가장 안정적이고 ~5GB(Q4)로 잘 맞습니다. 더 가볍게/무겁게 쓰려면 `.env`의 `OLLAMA_CHAT_MODEL` 변경. +- Whisper는 `WHISPER_DEVICE=cuda`/`float16` 기본. **GPU가 없으면 자동으로 CPU로 폴백**하므로 안전합니다. +- GPU가 아예 없는 호스트라면 `docker-compose.yml`의 두 `devices:` 블록을 지우고 `.env`에 `WHISPER_DEVICE=cpu`를 두면 됩니다. + - 데이터(메모리 DB), Whisper 캐시, Piper 음성은 named volume에 영속됩니다. - 셀프봇 영상 송출 의존성은 이미지에 기본 포함하지 않습니다. 쓰려면 컨테이너에서 `cd /app/bot && bun add discord.js-selfbot-v13 @dank074/discord-video-stream` 후 재시작(또는 Dockerfile에 추가). diff --git a/bridge/requirements-bridge.txt b/bridge/requirements-bridge.txt index e3d8ee2..a0f9531 100644 --- a/bridge/requirements-bridge.txt +++ b/bridge/requirements-bridge.txt @@ -5,12 +5,19 @@ # --- Brain runtime (imported when the reply engine loads) --- python-dotenv==1.0.1 -faster-whisper==1.0.3 +# >=1.1.0 pulls a ctranslate2 with Blackwell (sm_120) CUDA kernels. +faster-whisper>=1.1.0 mcp==1.13.1 numpy<2.0.0 rapidfuzz==3.6.1 requests==2.32.3 +# --- CUDA libraries for GPU-accelerated Whisper (RTX 5050 / sm_120) --- +# ctranslate2 dlopens these at transcribe time; LD_LIBRARY_PATH is set in the +# Dockerfile to point at them. Verified working on Blackwell sm_120. +nvidia-cublas-cu12 +nvidia-cudnn-cu12 + # --- Bridge HTTP service --- flask>=3.0.0 diff --git a/bridge/server.py b/bridge/server.py index 73d791e..5f46c0d 100644 --- a/bridge/server.py +++ b/bridge/server.py @@ -90,7 +90,16 @@ def _ensure_brain(): ) device = os.environ.get("WHISPER_DEVICE", "auto") compute = os.environ.get("WHISPER_COMPUTE_TYPE", "auto") - whisper = WhisperModel(cfg.whisper_model, device=device, compute_type=compute) + try: + whisper = WhisperModel(cfg.whisper_model, device=device, compute_type=compute) + except Exception as ge: + # GPU not available / unsupported -> fall back to CPU so the + # bridge still works without a GPU passed to the container. + if device != "cpu": + print(f"[bridge] whisper device='{device}' failed ({ge}); falling back to CPU", flush=True) + whisper = WhisperModel(cfg.whisper_model, device="cpu", compute_type="int8") + else: + raise _cfg, _db, _dialogue_memory, _whisper = cfg, db, dialogue_memory, whisper print(f"[bridge] brain ready (chat={cfg.ollama_chat_model}, whisper={cfg.whisper_model})", flush=True) diff --git a/docker-compose.yml b/docker-compose.yml index fae41d8..224d8ce 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,14 +19,10 @@ services: restart: unless-stopped volumes: - ollama_models:/root/.ollama - # --- GPU (optional): needs nvidia-container-toolkit on the host --- - # deploy: - # resources: - # reservations: - # devices: - # - driver: nvidia - # count: all - # capabilities: [gpu] + # GPU: needs nvidia-container-toolkit on the host (CDI). Verified on the + # RTX 5050 (Blackwell sm_120) — Ollama offloads 100% to GPU. + devices: + - "nvidia.com/gpu=all" # Auto-pull the models the brain needs, then exit. Idempotent (re-runnable). ollama-init: @@ -36,7 +32,7 @@ services: restart: "no" environment: OLLAMA_HOST: http://ollama:11434 - CHAT_MODEL: ${OLLAMA_CHAT_MODEL:-llama3.1:8b} + CHAT_MODEL: ${OLLAMA_CHAT_MODEL:-qwen3:8b} EMBED_MODEL: ${OLLAMA_EMBED_MODEL:-nomic-embed-text} entrypoint: ["/bin/sh", "-c"] command: @@ -58,12 +54,18 @@ services: environment: # Point the brain at the ollama service and the bot at the in-container bridge. OLLAMA_BASE_URL: http://ollama:11434 - OLLAMA_CHAT_MODEL: ${OLLAMA_CHAT_MODEL:-llama3.1:8b} + OLLAMA_CHAT_MODEL: ${OLLAMA_CHAT_MODEL:-qwen3:8b} OLLAMA_EMBED_MODEL: ${OLLAMA_EMBED_MODEL:-nomic-embed-text} WHISPER_MODEL: ${WHISPER_MODEL:-small} + WHISPER_DEVICE: ${WHISPER_DEVICE:-cuda} + WHISPER_COMPUTE_TYPE: ${WHISPER_COMPUTE_TYPE:-float16} BRIDGE_URL: http://127.0.0.1:8765 depends_on: - ollama + # GPU: accelerates Whisper STT (and anything else CUDA) in this container. + # Verified: faster-whisper float16 works on the RTX 5050 (sm_120). + devices: + - "nvidia.com/gpu=all" shm_size: "1gb" # Chrome needs a larger /dev/shm ports: # Host ports are overridable. If the HOST already runs VNC on 5901 @@ -75,7 +77,6 @@ services: - javis_data:/data # jarvis db + memory - whisper_cache:/root/.cache/huggingface # cached Whisper models - piper_voices:/opt/piper-voices # TTS voices - # --- GPU (optional): mirror the ollama GPU block above to accelerate Whisper --- volumes: ollama_models: diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index e3099ce..130a913 100755 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -8,11 +8,11 @@ set -euo pipefail : "${VNC_PASSWORD:=javis123}" : "${VNC_RESOLUTION:=1920x1080}" : "${OLLAMA_BASE_URL:=http://ollama:11434}" -: "${OLLAMA_CHAT_MODEL:=llama3.1:8b}" +: "${OLLAMA_CHAT_MODEL:=qwen3:8b}" : "${OLLAMA_EMBED_MODEL:=nomic-embed-text}" : "${WHISPER_MODEL:=small}" -: "${WHISPER_DEVICE:=cpu}" -: "${WHISPER_COMPUTE_TYPE:=int8}" +: "${WHISPER_DEVICE:=cuda}" +: "${WHISPER_COMPUTE_TYPE:=float16}" : "${JARVIS_DB_PATH:=/data/jarvis.db}" : "${BRIDGE_HOST:=0.0.0.0}" : "${BRIDGE_PORT:=8765}"