diff --git a/backend/app/models/chronos.py b/backend/app/models/chronos.py
index 328dacc..5ae1621 100644
--- a/backend/app/models/chronos.py
+++ b/backend/app/models/chronos.py
@@ -84,6 +84,26 @@ def _load() -> None:
         _state.update({"loaded": True, "pipe": pipe, "device": device})
 
 
+def _reload_cpu() -> None:
+    """Discard the current pipeline and force-reload Chronos on the CPU.
+
+    Automatic fallback for GPU incompatibilities that only surface at run time
+    on CUDA hosts (e.g. 'no kernel image is available for execution on the
+    device'). The CPU pipeline stays in ``_state`` for later calls; if it is
+    already installed, this returns without paying for another reload."""
+    import torch
+    from chronos import ChronosPipeline
+    with _lock:
+        if _state.get("loaded") and _state.get("device") == "cpu":
+            return  # a concurrent caller already completed the fallback
+        logger.warning("falling back to CPU for Chronos (GPU inference failed)")
+        _state.update({"loaded": False, "pipe": None, "device": None})
+        pipe = ChronosPipeline.from_pretrained(
+            MODEL_NAME, device_map="cpu", torch_dtype=torch.float32
+        )
+        _state.update({"loaded": True, "pipe": pipe, "device": "cpu"})
+
+
 def forecast(
     series: list[float],
     *,
@@ -102,14 +122,31 @@ def forecast(
     import numpy as np
     import torch
 
-    pipe = _state["pipe"]
-    context = torch.tensor([float(x) for x in series], dtype=torch.float32)
-    with torch.no_grad():
-        samples = pipe.predict(
-            context=context,
-            prediction_length=horizon,
-            num_samples=num_samples,
-        )
+    def _do_predict():
+        pipe = _state["pipe"]
+        context = torch.tensor([float(x) for x in series], dtype=torch.float32)
+        with torch.no_grad():
+            return pipe.predict(
+                context=context,
+                prediction_length=horizon,
+                num_samples=num_samples,
+            )
+
+    try:
+        samples = _do_predict()
+    except RuntimeError as exc:
+        # cuda 빌드/드라이버 미스매치는 inference 시점에야 드러나는 경우가 많음.
+        # 'no kernel image is available' / 'CUDA error' 같은 신호 잡아서 CPU 로 폴백.
+        msg = str(exc)
+        if _state.get("device") == "cuda" and (
+            "no kernel image" in msg
+            or "CUDA error" in msg
+            or "CUBLAS" in msg
+        ):
+            _reload_cpu()
+            samples = _do_predict()
+        else:
+            raise
     # samples: (1, num_samples, prediction_length)
     arr = samples[0].cpu().float().numpy()
     q10 = np.quantile(arr, 0.10, axis=0).tolist()