Enable GPU: LLM + Whisper on the RTX 5050, pick qwen3:8b

GPU acceleration is now on by default and verified end-to-end on the Blackwell RTX 5050 (sm_120):
- Ollama offloads 100% to the GPU (log: library=CUDA compute=12.0, BLACKWELL_NATIVE_FP4=1). Compose passes the GPU via CDI (devices: nvidia.com/gpu=all) to both the ollama and javis services.
- Whisper STT runs on the GPU: faster-whisper>=1.1.0 + nvidia-cublas/cudnn cu12, with LD_LIBRARY_PATH baked into the image. Verified float16 transcription on sm_120; the bridge automatically falls back to CPU when no GPU is present.
- Model: the default chat model is now qwen3:8b (best tool-calling model for 8GB of VRAM, ~5GB at Q4). The embedding model stays nomic-embed-text.
- README documents the one-time host setup (nvidia-container-toolkit + `nvidia-ctk cdi generate`) and how to turn the GPU on/off.

Verified: image builds; GPU visible in both containers via Compose; `ollama ps` = 100% GPU; faster-whisper CUDA OK + CPU fallback OK; bridge /health returns 200.
2026-06-09 15:49:21 +09:00
parent 25c77ac794
commit 0dbc0300d7
7 changed files with 61 additions and 22 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -19,14 +19,10 @@ services:
    restart: unless-stopped
    volumes:
      - ollama_models:/root/.ollama
-    # --- GPU (optional): needs nvidia-container-toolkit on the host ---
-    # deploy:
-    #   resources:
-    #     reservations:
-    #       devices:
-    #         - driver: nvidia
-    #           count: all
-    #           capabilities: [gpu]
+    # GPU: needs nvidia-container-toolkit on the host (CDI). Verified on the
+    # RTX 5050 (Blackwell sm_120) — Ollama offloads 100% to GPU.
+    devices:
+      - "nvidia.com/gpu=all"

  # Auto-pull the models the brain needs, then exit. Idempotent (re-runnable).
  ollama-init:
@@ -36,7 +32,7 @@ services:
    restart: "no"
    environment:
      OLLAMA_HOST: http://ollama:11434
-      CHAT_MODEL: ${OLLAMA_CHAT_MODEL:-llama3.1:8b}
+      CHAT_MODEL: ${OLLAMA_CHAT_MODEL:-qwen3:8b}
      EMBED_MODEL: ${OLLAMA_EMBED_MODEL:-nomic-embed-text}
    entrypoint: ["/bin/sh", "-c"]
    command:
@@ -58,12 +54,18 @@ services:
    environment:
      # Point the brain at the ollama service and the bot at the in-container bridge.
      OLLAMA_BASE_URL: http://ollama:11434
-      OLLAMA_CHAT_MODEL: ${OLLAMA_CHAT_MODEL:-llama3.1:8b}
+      OLLAMA_CHAT_MODEL: ${OLLAMA_CHAT_MODEL:-qwen3:8b}
      OLLAMA_EMBED_MODEL: ${OLLAMA_EMBED_MODEL:-nomic-embed-text}
      WHISPER_MODEL: ${WHISPER_MODEL:-small}
+      WHISPER_DEVICE: ${WHISPER_DEVICE:-cuda}
+      WHISPER_COMPUTE_TYPE: ${WHISPER_COMPUTE_TYPE:-float16}
      BRIDGE_URL: http://127.0.0.1:8765
    depends_on:
      - ollama
+    # GPU: accelerates Whisper STT (and anything else CUDA) in this container.
+    # Verified: faster-whisper float16 works on the RTX 5050 (sm_120).
+    devices:
+      - "nvidia.com/gpu=all"
    shm_size: "1gb"          # Chrome needs a larger /dev/shm
    ports:
      # Host ports are overridable. If the HOST already runs VNC on 5901
@@ -75,7 +77,6 @@ services:
      - javis_data:/data                         # jarvis db + memory
      - whisper_cache:/root/.cache/huggingface   # cached Whisper models
      - piper_voices:/opt/piper-voices           # TTS voices
-    # --- GPU (optional): mirror the ollama GPU block above to accelerate Whisper ---

 volumes:
  ollama_models: