# stock_chart_site/backend/app/fetch/news.py

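"""News collection: Naver Finance per-stock pages + Google News RSS.

To reduce the risk of being blocked, requests use a User-Agent + timeout + retry.
Duplicates are removed via the unique constraint on URL.
"""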
from __future__ import annotations

import logging
import re
from dataclasses import dataclass
from datetime import datetime, timezone, timedelta
from typing import Any
from urllib.parse import urljoin

import feedparser
import httpx
from bs4 import BeautifulSoup
from sqlalchemy import text
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

from app.db.connection import get_engine

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Sent as the User-Agent header on every request made by the fetchers in this module.
USER_AGENT = "Mozilla/5.0 (compatible; stock_chart_site/0.1; +personal)"
# Fixed UTC+9 offset (Korea Standard Time); attached to timestamps parsed from Naver pages.
KST = timezone(timedelta(hours=9))


@dataclass
class NewsItem:
    """One news headline collected from an external source.

    Instances are built by the fetchers in this module and written to the
    ``news`` table by ``upsert_news``, one column per field.
    """

    code: str | None  # stock code the item was fetched for; None when the caller supplied none
    source: str  # origin tag set by the fetcher ("naver_finance" or "google_rss")
    title: str  # headline text
    url: str  # article link; upsert_news uses ON CONFLICT (url), so this is the dedupe key
    published_at: datetime  # timezone-aware publication time (KST from Naver, UTC from Google RSS)
    body: str | None = None  # article body; the fetchers visible in this file leave it unset


@retry(
    stop=stop_after_attempt(2),
    wait=wait_exponential(multiplier=1, min=1, max=4),
    retry=retry_if_exception_type((httpx.HTTPError, httpx.TimeoutException)),
    reraise=True,
)
def fetch_naver_finance_news(code: str, *, max_pages: int = 1) -> list[NewsItem]:
    """Scrape the Naver Finance news list for one stock.

    Example page: https://finance.naver.com/item/news_news.naver?code=005930

    Args:
        code: Stock code, e.g. ``"005930"``.
        max_pages: Number of list pages to fetch, starting at page 1.
            A value below 1 fetches nothing and returns an empty list.

    Returns:
        One ``NewsItem`` (``source="naver_finance"``) per table row that has
        a link, a non-empty title and a parseable timestamp, in page order.
        Rows missing any of these are skipped.

    Raises:
        httpx.HTTPError: On a transport failure or an error status code. The
            whole call (all pages) is attempted twice before the error is
            re-raised.
    """
    list_url = "https://finance.naver.com/item/news_news.naver"
    headers = {"User-Agent": USER_AGENT, "Referer": "https://finance.naver.com/"}
    out: list[NewsItem] = []
    # One client for all pages so the connection is reused rather than
    # re-established for every request.
    with httpx.Client(timeout=10.0, headers=headers) as cli:
        for page in range(1, max_pages + 1):
            # Query values go through `params` so httpx percent-encodes them.
            resp = cli.get(
                list_url,
                params={
                    "code": code,
                    "page": page,
                    "sm": "title_entity_id.basic",
                    "clusterId": "",
                },
            )
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "lxml")
            for tr in soup.select("table.type5 tr"):
                a = tr.select_one("a.tit") or tr.select_one("td.title a")
                if not a:
                    continue
                link = a.get("href") or ""
                if not link:
                    continue
                # urljoin leaves absolute links untouched and resolves every
                # relative form ("/path", "path", "//host/path") against the
                # list page, where plain string prefixing only handled "/path".
                full_url = urljoin(list_url, link)
                title = a.get_text(strip=True)
                time_td = tr.select_one("td.date")
                time_text = time_td.get_text(strip=True) if time_td else ""
                published_at = _parse_naver_time(time_text)
                if not title or not published_at:
                    continue
                out.append(
                    NewsItem(
                        code=code,
                        source="naver_finance",
                        title=title,
                        url=full_url,
                        published_at=published_at,
                    )
                )
    return out


def _parse_naver_time(s: str) -> datetime | None:
    s = s.strip()
    # '2026.05.20 13:24' 형태
    m = re.match(r"(\d{4})[.-](\d{2})[.-](\d{2})\s+(\d{2}):(\d{2})", s)
    if m:
        y, mo, d, h, mi = (int(x) for x in m.groups())
        return datetime(y, mo, d, h, mi, tzinfo=KST)
    return None


@retry(
    stop=stop_after_attempt(2),
    wait=wait_exponential(multiplier=1, min=1, max=4),
    retry=retry_if_exception_type((httpx.HTTPError, httpx.TimeoutException)),
    reraise=True,
)
def fetch_google_news_rss(query: str, *, code: str | None = None, hl: str = "ko", gl: str = "KR") -> list[NewsItem]:
    """Search Google News RSS and return the matching headlines.

    Args:
        query: Raw (not pre-encoded) search text, typically a stock name or
            code. It is percent-encoded here, so spaces, ``&``, ``#`` and
            ``+`` are sent literally as part of the search term.
        code: Stock code to stamp on every returned item; ``None`` leaves
            the items unassociated.
        hl: Interface language sent as ``hl`` and as the second half of
            ``ceid``.
        gl: Country sent as ``gl`` and as the first half of ``ceid``.

    Returns:
        One ``NewsItem`` (``source="google_rss"``) per feed entry that has a
        title, a link and a parsed publication time, in feed order.
        ``published_at`` is timezone-aware UTC.

    Raises:
        httpx.HTTPError: On a transport failure or an error status code,
            after two attempts.
    """
    with httpx.Client(timeout=10.0, headers={"User-Agent": USER_AGENT}) as cli:
        # Pass the query as `params` rather than pasting it into the URL:
        # an unencoded '&' or '#' in `query` would otherwise cut the search
        # term short or inject extra parameters.
        resp = cli.get(
            "https://news.google.com/rss/search",
            params={"q": query, "hl": hl, "gl": gl, "ceid": f"{gl}:{hl}"},
        )
        resp.raise_for_status()
        body = resp.text
    feed = feedparser.parse(body)
    out: list[NewsItem] = []
    for entry in feed.entries:
        title = (entry.get("title") or "").strip()
        link = entry.get("link") or ""
        if not title or not link:
            continue
        published = entry.get("published_parsed")
        if not published:
            continue
        # The parsed time tuple is treated as UTC; take its first six fields
        # (year through second).
        pub_dt = datetime(*published[:6], tzinfo=timezone.utc)
        out.append(NewsItem(code=code, source="google_rss", title=title, url=link, published_at=pub_dt))
    return out


def upsert_news(items: list[NewsItem]) -> tuple[int, int]:
    """Store news items, leaving rows whose URL already exists untouched.

    All inserts run inside a single transaction obtained from
    ``get_engine().begin()``. An empty ``items`` returns immediately without
    touching the database.

    Args:
        items: Items to store; each is written with one INSERT statement.

    Returns:
        ``(inserted, skipped)``: how many rows were newly created and how
        many were ignored because of a URL conflict.
    """
    if not items:
        return 0, 0

    inserted = 0
    skipped = 0
    with get_engine().begin() as conn:
        for news in items:
            params = {
                "code": news.code,
                "source": news.source,
                "published_at": news.published_at,
                "title": news.title,
                "url": news.url,
                "body": news.body,
            }
            outcome = conn.execute(
                text(
                    """
                    INSERT INTO news (code, source, published_at, title, url, body)
                    VALUES (:code, :source, :published_at, :title, :url, :body)
                    ON CONFLICT (url) DO NOTHING
                    RETURNING id
                    """
                ),
                params,
            )
            # RETURNING yields a row only when the INSERT actually happened;
            # a conflict on url produces an empty result.
            if outcome.first():
                inserted += 1
                continue
            skipped += 1
    return inserted, skipped