- backend/app/nlp/finbert.py: snunlp/KR-FinBert-SC 어댑터. - score = P(pos) - P(neg) ∈ [-1, +1], label = argmax (neg/neu/pos) - 768d mean-pooled last hidden state → news.embedding (VECTOR) 저장 - settings.huggingface_token 인증, lazy singleton, cuda/cpu auto - backend/app/nlp/score_news.py: news 테이블에서 sentiment_score IS NULL 행을 배치 스코어 → UPDATE (... embedding=(:e)::vector). 종목 필터 + limit 옵션. - backend/app/db/migrations/002_sentiment_view.sql: v_sentiment_daily 뷰. 종목·KST 일별 n_articles, mean_score, pos/neg/neu_ratio, weighted_score (naver_finance 1.0 / google_rss 0.7 / dart 0.5). - backend/app/db/migrate.py: 이미 실행 중인 DB 에 새 SQL 마이그레이션 적용용 CLI. 모든 SQL 파일은 idempotent. - refresh_one.py: refresh 끝에 종목당 200건까지 finbert 스코어, finbert SourceStatus 를 RefreshReport 에 추가. - daily_batch.py: 모든 종목 처리 후 score_pending_news(limit=2000) 로 mop-up. 모델 캐시는 docker-compose hf_cache 볼륨(/root/.cache/huggingface). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
170 lines
5.7 KiB
Python
170 lines
5.7 KiB
Python
"""한 종목에 대해 모든 소스를 갱신 + 구조화된 status 리턴.
|
|
|
|
POST /api/refresh/{code} 와 daily_batch 둘 다 이 함수를 호출.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
from dataclasses import asdict, dataclass, field
|
|
from datetime import date, timedelta
|
|
from typing import Any
|
|
|
|
from app.fetch import dart as dart_mod
|
|
from app.fetch import kis as kis_mod
|
|
from app.fetch import news as news_mod
|
|
from app.fetch import pykrx_helper
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
|
|
class SourceStatus:
|
|
status: str # 'ok' / 'skipped_missing_key' / 'failed'
|
|
inserted: int = 0
|
|
updated: int = 0
|
|
skipped: int = 0
|
|
extra: dict[str, Any] = field(default_factory=dict)
|
|
error: str | None = None
|
|
|
|
|
|
@dataclass
|
|
class RefreshReport:
|
|
code: str
|
|
pykrx_ohlcv: SourceStatus
|
|
pykrx_trading_value: SourceStatus
|
|
kis_daily: SourceStatus
|
|
dart: SourceStatus
|
|
naver_news: SourceStatus
|
|
google_rss: SourceStatus
|
|
finbert: SourceStatus
|
|
|
|
def to_dict(self) -> dict[str, Any]:
|
|
out: dict[str, Any] = {"code": self.code}
|
|
for f in (
|
|
"pykrx_ohlcv",
|
|
"pykrx_trading_value",
|
|
"kis_daily",
|
|
"dart",
|
|
"naver_news",
|
|
"google_rss",
|
|
"finbert",
|
|
):
|
|
v: SourceStatus = getattr(self, f)
|
|
out[f] = asdict(v)
|
|
return out
|
|
|
|
|
|
def _pykrx_ohlcv(code: str, start: date, end: date) -> SourceStatus:
|
|
try:
|
|
res = pykrx_helper.fetch_ohlcv_daily(code, start, end)
|
|
return SourceStatus(
|
|
status=res.status(),
|
|
inserted=res.inserted,
|
|
updated=res.updated,
|
|
error=res.error,
|
|
)
|
|
except Exception as exc: # noqa: BLE001
|
|
return SourceStatus(status="failed", error=str(exc))
|
|
|
|
|
|
def _pykrx_trading(code: str, start: date, end: date) -> SourceStatus:
|
|
try:
|
|
res = pykrx_helper.fetch_trading_value(code, start, end)
|
|
return SourceStatus(
|
|
status=res.status(),
|
|
inserted=res.inserted,
|
|
updated=res.updated,
|
|
error=res.error,
|
|
)
|
|
except Exception as exc: # noqa: BLE001
|
|
return SourceStatus(status="failed", error=str(exc))
|
|
|
|
|
|
def _kis(code: str, start: date, end: date) -> SourceStatus:
|
|
"""KIS read-only EOD. 실제 DB 적재는 하지 않고 sanity 호출 + sample row 수만 리포트.
|
|
pykrx 와 중복 데이터이므로 KIS 는 백업/실시간 용도이고, 일별 적재는 pykrx 가 1차.
|
|
"""
|
|
try:
|
|
rows = kis_mod.fetch_daily_price(code, start, end)
|
|
return SourceStatus(status="ok", extra={"sample_rows": len(rows)})
|
|
except kis_mod.SkippedMissingKey:
|
|
return SourceStatus(status="skipped_missing_key")
|
|
except Exception as exc: # noqa: BLE001
|
|
return SourceStatus(status="failed", error=str(exc))
|
|
|
|
|
|
def _dart(code: str, start: date, end: date) -> SourceStatus:
|
|
try:
|
|
items = dart_mod.fetch_disclosures(code, start, end)
|
|
# 공시는 news 테이블에 upsert
|
|
news_items = [
|
|
news_mod.NewsItem(
|
|
code=d.code,
|
|
source="dart",
|
|
title=d.title,
|
|
url=d.url,
|
|
published_at=d.published_at,
|
|
)
|
|
for d in items
|
|
]
|
|
ins, skip = news_mod.upsert_news(news_items)
|
|
return SourceStatus(status="ok", inserted=ins, skipped=skip, extra={"fetched": len(items)})
|
|
except dart_mod.SkippedMissingKey:
|
|
return SourceStatus(status="skipped_missing_key")
|
|
except Exception as exc: # noqa: BLE001
|
|
return SourceStatus(status="failed", error=str(exc))
|
|
|
|
|
|
def _naver_news(code: str) -> SourceStatus:
|
|
try:
|
|
items = news_mod.fetch_naver_finance_news(code, max_pages=1)
|
|
ins, skip = news_mod.upsert_news(items)
|
|
return SourceStatus(status="ok", inserted=ins, skipped=skip, extra={"fetched": len(items)})
|
|
except Exception as exc: # noqa: BLE001
|
|
return SourceStatus(status="failed", error=str(exc))
|
|
|
|
|
|
def _google_rss(code: str, name: str) -> SourceStatus:
|
|
try:
|
|
query = name or code
|
|
items = news_mod.fetch_google_news_rss(query, code=code)
|
|
ins, skip = news_mod.upsert_news(items)
|
|
return SourceStatus(status="ok", inserted=ins, skipped=skip, extra={"fetched": len(items)})
|
|
except Exception as exc: # noqa: BLE001
|
|
return SourceStatus(status="failed", error=str(exc))
|
|
|
|
|
|
def _finbert(code: str) -> SourceStatus:
|
|
"""방금 upsert 된 뉴스 중 sentiment_score 가 비어있는 행을 KR-FinBERT 로 스코어."""
|
|
try:
|
|
from app.nlp.score_news import score_pending_news
|
|
|
|
# 한 종목에 대해 신규 뉴스가 매우 많아도 200건으로 컷.
|
|
# daily_batch 끝에서 잔여분을 별도로 mop-up 한다.
|
|
res = score_pending_news(code=code, limit=200)
|
|
return SourceStatus(
|
|
status="ok" if res.error is None else "failed",
|
|
inserted=res.scored,
|
|
skipped=res.failed,
|
|
extra={"fetched": res.fetched},
|
|
error=res.error,
|
|
)
|
|
except Exception as exc: # noqa: BLE001
|
|
return SourceStatus(status="failed", error=str(exc))
|
|
|
|
|
|
def refresh_code(code: str, name: str, *, lookback_days: int = 7) -> RefreshReport:
|
|
"""단기 갱신 (daily_batch 용). 최근 lookback_days 만 가져온다."""
|
|
end = date.today()
|
|
start = end - timedelta(days=lookback_days)
|
|
return RefreshReport(
|
|
code=code,
|
|
pykrx_ohlcv=_pykrx_ohlcv(code, start, end),
|
|
pykrx_trading_value=_pykrx_trading(code, start, end),
|
|
kis_daily=_kis(code, start, end),
|
|
dart=_dart(code, start, end),
|
|
naver_news=_naver_news(code),
|
|
google_rss=_google_rss(code, name),
|
|
finbert=_finbert(code),
|
|
)
|