feat(phase-2): KR-FinBERT 감성 스코어링 + 일별 집계 뷰

- backend/app/nlp/finbert.py: snunlp/KR-FinBert-SC 어댑터.
  - score = P(pos) - P(neg) ∈ [-1, +1], label = argmax (neg/neu/pos)
  - 768d mean-pooled last hidden state → news.embedding (VECTOR) 저장
  - settings.huggingface_token 인증, lazy singleton, cuda/cpu auto
- backend/app/nlp/score_news.py: news 테이블에서 sentiment_score IS NULL 행을
  배치 스코어 → UPDATE (... embedding=(:e)::vector). 종목 필터 + limit 옵션.
- backend/app/db/migrations/002_sentiment_view.sql: v_sentiment_daily 뷰.
  종목·KST 일별 n_articles, mean_score, pos/neg/neu_ratio, weighted_score
  (naver_finance 1.0 / google_rss 0.7 / dart 0.5).
- backend/app/db/migrate.py: 이미 실행 중인 DB 에 새 SQL 마이그레이션 적용용 CLI.
  모든 SQL 파일은 idempotent.
- refresh_one.py: refresh 끝에 종목당 200건까지 finbert 스코어,
  finbert SourceStatus 를 RefreshReport 에 추가.
- daily_batch.py: 모든 종목 처리 후 score_pending_news(limit=2000) 로 mop-up.

모델 캐시는 docker-compose hf_cache 볼륨(/root/.cache/huggingface).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-20 15:57:34 +09:00
parent 239b104a2b
commit edda01adbf
7 changed files with 369 additions and 0 deletions
--- a/backend/app/pipelines/daily_batch.py
+++ b/backend/app/pipelines/daily_batch.py
@@ -11,6 +11,7 @@ import time
 from typing import Any

 from app.fetch import macro as macro_mod
+from app.nlp.score_news import score_pending_news
 from app.pipelines.refresh_one import refresh_code
 from app.seed.seed_tickers import SEED_TICKERS

@@ -32,11 +33,26 @@ def run_daily_batch() -> dict[str, Any]:
        for m in macros
    ]

+    # 시드 종목 refresh 내에서 종목당 200건만 스코어함. 잔여(여러 소스 합쳐
+    # 200건 초과 또는 코드 매핑 안된 google_rss 등)는 여기서 한 번에 mop-up.
+    try:
+        mop = score_pending_news(limit=2000)
+        sentiment_summary: dict[str, Any] = {
+            "status": "ok" if mop.error is None else "failed",
+            "fetched": mop.fetched,
+            "scored": mop.scored,
+            "failed": mop.failed,
+            "error": mop.error,
+        }
+    except Exception as exc:  # noqa: BLE001
+        sentiment_summary = {"status": "failed", "error": str(exc)}
+
    elapsed = time.time() - start_ts
    return {
        "duration_seconds": round(elapsed, 2),
        "tickers": reports,
        "macro": macro_summary,
+        "sentiment_mop": sentiment_summary,
    }


--- a/backend/app/pipelines/refresh_one.py
+++ b/backend/app/pipelines/refresh_one.py
@@ -36,6 +36,7 @@ class RefreshReport:
    dart: SourceStatus
    naver_news: SourceStatus
    google_rss: SourceStatus
+    finbert: SourceStatus

    def to_dict(self) -> dict[str, Any]:
        out: dict[str, Any] = {"code": self.code}
@@ -46,6 +47,7 @@ class RefreshReport:
            "dart",
            "naver_news",
            "google_rss",
+            "finbert",
        ):
            v: SourceStatus = getattr(self, f)
            out[f] = asdict(v)
@@ -132,6 +134,25 @@ def _google_rss(code: str, name: str) -> SourceStatus:
        return SourceStatus(status="failed", error=str(exc))


+def _finbert(code: str) -> SourceStatus:
+    """Score just-upserted news rows with an empty sentiment_score using KR-FinBERT."""
+    try:
+        from app.nlp.score_news import score_pending_news
+
+        # Cap at 200 rows per ticker, even when there are many new articles.
+        # daily_batch mops up whatever is left in a separate pass at its end.
+        res = score_pending_news(code=code, limit=200)
+        return SourceStatus(
+            status="ok" if res.error is None else "failed",
+            inserted=res.scored,
+            skipped=res.failed,
+            extra={"fetched": res.fetched},
+            error=res.error,
+        )
+    except Exception as exc:  # noqa: BLE001
+        return SourceStatus(status="failed", error=str(exc))
+
+
 def refresh_code(code: str, name: str, *, lookback_days: int = 7) -> RefreshReport:
    """단기 갱신 (daily_batch 용). 최근 lookback_days 만 가져온다."""
    end = date.today()
@@ -144,4 +165,5 @@ def refresh_code(code: str, name: str, *, lookback_days: int = 7) -> RefreshRepo
        dart=_dart(code, start, end),
        naver_news=_naver_news(code),
        google_rss=_google_rss(code, name),
+        finbert=_finbert(code),
    )