feat: 예측 실패 원인 노출 + /health/models 진단 + restart-ci.bat

사금향님이 만난 409 'both chronos & lgbm failed' 에러가 원인을 보여주지 않아 디버깅이 어려웠음. 세 군데 보강:
1. ensemble.py: 두 모델이 모두 실패하면 chronos/lgbm 각각의 실제 에러 원문 (Type: message 형식) 을 RuntimeError 메시지에 포함. predict.py 가 409 detail 로 그대로 노출하므로 브라우저에서 바로 원인 확인 가능. LGBM 이 None 을 반환 (체크포인트 없음) 하는 경우도 'model checkpoint not found' 로 명시.
2. /health/models 엔드포인트 추가:
   - chronos.ping() — lazy load 시도 + 디바이스/모델명 반환
   - LGBM_MODEL_DIR 의 *.pkl 개수와 샘플 파일명 (최대 8개) 반환. cold start (체크포인트 0개) 면 'no_checkpoints' 상태로 알림.
3. restart-ci.bat 추가 — restart.bat 에서 pause 를 빼고 종료 코드로만 결과를 알리는 SSH 비대화형 친화 버전. 일반 사용은 그대로 restart.bat.
2026-05-23 15:42:44 +09:00
parent 44873ddb39
commit e0edc8f1e3
3 changed files with 98 additions and 3 deletions
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -132,3 +132,30 @@ def health_keys() -> dict[str, object]:
        "dart": dart_mod.ping(),
        # huggingface 는 모델 다운로드 시점에 확인 (별도 ping 호출 비용 회피)
    }
+
+
+@app.get("/health/models")
+def health_models() -> dict[str, object]:
+    """Report Chronos / LGBM availability for diagnostics.
+
+    Chronos: returns chronos_mod.ping(); a first (lazy-load) call may take 30 seconds to several minutes (HuggingFace download).
+    LGBM: scans the checkpoint directory for *.pkl files — empty on a cold start where retrain has not run yet.
+    """
+    from pathlib import Path
+
+    from app.models import chronos as chronos_mod
+
+    lgbm_dir = Path(os.environ.get("LGBM_MODEL_DIR", "/app/data/models"))
+    lgbm_files: list[str] = []
+    if lgbm_dir.exists():
+        lgbm_files = sorted(p.name for p in lgbm_dir.glob("*.pkl"))
+
+    return {
+        "chronos": chronos_mod.ping(),
+        "lgbm": {
+            "model_dir": str(lgbm_dir),
+            "checkpoint_count": len(lgbm_files),
+            "samples": lgbm_files[:8],  # Cap the listing at the first 8 sorted names.
+            "status": "ok" if lgbm_files else "no_checkpoints (cold start, run retrain_weekly)",
+        },
+    }
--- a/backend/app/models/ensemble.py
+++ b/backend/app/models/ensemble.py
@@ -87,30 +87,41 @@ def predict(code: str, *, horizons: tuple[int, ...] = (1, 3, 5)) -> EnsemblePred

    sources_used: list[str] = []
    cf: ChronosForecast | None = None
+    chronos_err: str | None = None
    try:
        cf = chronos_forecast(closes, horizon=max_h, num_samples=30)
        sources_used.append("chronos")
    except Exception as exc:  # noqa: BLE001
-        logger.warning("chronos forecast failed for %s: %s", code, exc)
+        chronos_err = f"{type(exc).__name__}: {exc}"
+        logger.warning("chronos forecast failed for %s: %s", code, chronos_err)

    steps: list[EnsembleStep] = []
    lgbm_raw: dict[int, LgbmForecast] = {}
    for h in horizons:
        lf: LgbmForecast | None = None
+        lgbm_err: str | None = None
        try:
            lf = lgbm_predict(code, h)
            if lf is not None:
                sources_used.append(f"lgbm_h{h}")
                lgbm_raw[h] = lf
+            else:
+                # predict_one 이 None 반환 = 체크포인트 파일 없음 (cold start).
+                lgbm_err = "model checkpoint not found (run retrain_weekly)"
        except Exception as exc:  # noqa: BLE001
-            logger.warning("lgbm predict failed for %s h=%d: %s", code, h, exc)
+            lgbm_err = f"{type(exc).__name__}: {exc}"
+            logger.warning("lgbm predict failed for %s h=%d: %s", code, h, lgbm_err)

        # 가중치 (DB 없으면 default 0.6/0.4).
        w = load_weights(code, h)
        wc, wl = w.w_chronos, w.w_lgbm
        # 한쪽이 없으면 다른 쪽 전부.
        if cf is None and lf is None:
-            raise RuntimeError(f"both chronos & lgbm failed for {code} h={h}")
+            # 사용자가 브라우저에서 바로 원인을 보게 두 에러를 그대로 노출.
+            raise RuntimeError(
+                f"both chronos & lgbm failed for {code} h={h}; "
+                f"chronos={chronos_err or 'unknown'}; lgbm={lgbm_err or 'unknown'}"
+            )
        if cf is None:
            wc, wl = 0.0, 1.0
        if lf is None: