"""모델 학습/추론용 피처 빌더. 종목 1개 + 룩백 기간을 받아 (date 단위) DataFrame 반환: - OHLCV - returns r1 - TA: rsi14, macd, macd_signal, atr14, bb_pct, sma20, ema12, vol_z20 - trading_value: foreign_net, institution_net, individual_net (정규화 X, scale 그대로) - macro 정렬: kospi, kosdaq, usdkrw, us10y, kospi_r1, usdkrw_r1 - sentiment (v_sentiment_daily): mean_score, weighted_score, n_articles, pos_minus_neg = pos_ratio - neg_ratio. 3일 롤링 mean 도 추가. 학습 타깃 (build_features 에서만 생성): - y_close_h{1,3,5}: close.shift(-H) - y_ret_h{1,3,5}: y_close_h / close - 1 - y_dir_h{1,3,5}: sign(y_ret_h) (1=up, -1=down, 0=flat ±0.3% 이내) inference 용 build_features 는 dropna 안 함. 학습용 build_training_frame 은 dropna. """ from __future__ import annotations import logging from dataclasses import dataclass from datetime import date, timedelta import numpy as np import pandas as pd from sqlalchemy import text from app.db.connection import get_engine logger = logging.getLogger(__name__) FLAT_BAND = 0.003 # ±0.3% 이내는 flat HORIZONS_DEFAULT = (1, 3, 5) @dataclass class FeatureFrame: code: str df: pd.DataFrame target_horizons: tuple[int, ...] 
def _query_frame(sql: str, params: dict[str, object], columns: list[str]) -> pd.DataFrame:
    """Run a parameterized SELECT and return its rows as a DataFrame.

    Args:
        sql: SQL text using ``:name`` bind parameters.
        params: Values for the bind parameters.
        columns: Column labels, in SELECT order, for the resulting frame.

    Returns:
        A frame with exactly ``columns``; it has zero rows when the query
        matches nothing.
    """
    with get_engine().connect() as conn:
        rows = conn.execute(text(sql), params).all()
    if not rows:
        return pd.DataFrame(columns=columns)
    return pd.DataFrame(rows, columns=columns)


def _normalize_dates(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the ``date`` column to ``datetime.date`` objects (no-op when empty)."""
    if not df.empty:
        df["date"] = pd.to_datetime(df["date"]).dt.date
    return df


def _load_ohlcv(code: str, start: date, end: date) -> pd.DataFrame:
    """Load daily OHLCV rows for ``code`` with ``start <= date <= end``, ordered by date."""
    columns = ["date", "open", "high", "low", "close", "volume"]
    sql = """
        SELECT date, open, high, low, close, volume
        FROM ohlcv_daily
        WHERE code = :code AND date BETWEEN :s AND :e
        ORDER BY date
    """
    frame = _query_frame(sql, {"code": code, "s": start, "e": end}, columns)
    return _normalize_dates(frame)


def _load_trading(code: str, start: date, end: date) -> pd.DataFrame:
    """Load daily net trading values for ``code`` within the inclusive date range."""
    columns = ["date", "foreign_net", "institution_net", "individual_net"]
    sql = """
        SELECT date, foreign_net, institution_net, individual_net
        FROM trading_value_daily
        WHERE code = :code AND date BETWEEN :s AND :e
        ORDER BY date
    """
    frame = _query_frame(sql, {"code": code, "s": start, "e": end}, columns)
    return _normalize_dates(frame)


def _load_macro(start: date, end: date) -> pd.DataFrame:
    """Load macro series for the date range, pivoted to one column per key.

    Returns:
        A frame with a ``date`` column plus one column per macro key found.
        When the table has no rows in range, an empty frame with only ``date``.
    """
    sql = (
        "SELECT date, key, value FROM macro_daily "
        "WHERE date BETWEEN :s AND :e ORDER BY date"
    )
    long_frame = _query_frame(sql, {"s": start, "e": end}, ["date", "key", "value"])
    if long_frame.empty:
        return pd.DataFrame(columns=["date"])
    # Long (date, key, value) -> wide; "last" wins if a (date, key) pair repeats.
    pivot = long_frame.pivot_table(
        index="date", columns="key", values="value", aggfunc="last"
    ).reset_index()
    pivot["date"] = pd.to_datetime(pivot["date"]).dt.date
    pivot.columns.name = None
    return pivot


def _load_sentiment(code: str, start: date, end: date) -> pd.DataFrame:
    """Load daily sentiment aggregates for ``code`` and derive ``pos_minus_neg``.

    Returns:
        The queried columns plus ``pos_minus_neg`` (pos_ratio - neg_ratio, with
        missing ratios treated as 0). When there are no rows, an empty frame
        with only the queried columns.
    """
    columns = ["date", "n_articles", "mean_score", "pos_ratio", "neg_ratio", "weighted_score"]
    sql = """
        SELECT date, n_articles, mean_score, pos_ratio, neg_ratio, weighted_score
        FROM v_sentiment_daily
        WHERE code = :code AND date BETWEEN :s AND :e
        ORDER BY date
    """
    frame = _query_frame(sql, {"code": code, "s": start, "e": end}, columns)
    if frame.empty:
        return frame
    frame = _normalize_dates(frame)
    frame["pos_minus_neg"] = frame["pos_ratio"].fillna(0) - frame["neg_ratio"].fillna(0)
    return frame


def _add_ta(df: pd.DataFrame) -> pd.DataFrame:
    """Add technical-indicator columns computed with the ``ta`` package.

    Adds r1, rsi14, macd, macd_signal, atr14, bb_pct, sma20, ema12 and vol_z20
    to ``df`` in place and returns the same frame.
    """
    # Imported lazily so the module can be imported without ``ta`` installed.
    from ta.momentum import RSIIndicator
    from ta.trend import EMAIndicator, MACD, SMAIndicator
    from ta.volatility import AverageTrueRange, BollingerBands

    close = df["close"].astype(float)
    high = df["high"].astype(float)
    low = df["low"].astype(float)
    vol = df["volume"].astype(float)

    df["r1"] = close.pct_change()
    df["rsi14"] = RSIIndicator(close=close, window=14, fillna=False).rsi()

    macd = MACD(close=close, window_slow=26, window_fast=12, window_sign=9, fillna=False)
    df["macd"] = macd.macd()
    df["macd_signal"] = macd.macd_signal()

    df["atr14"] = AverageTrueRange(
        high=high, low=low, close=close, window=14, fillna=False
    ).average_true_range()

    bb = BollingerBands(close=close, window=20, window_dev=2, fillna=False)
    df["bb_pct"] = bb.bollinger_pband()

    df["sma20"] = SMAIndicator(close=close, window=20, fillna=False).sma_indicator()
    df["ema12"] = EMAIndicator(close=close, window=12, fillna=False).ema_indicator()

    # Volume z-score over a 20-row window; a zero std becomes NaN so the
    # division yields NaN instead of +/-inf.
    vol_mean = vol.rolling(20).mean()
    vol_std = vol.rolling(20).std().replace(0, np.nan)
    df["vol_z20"] = (vol - vol_mean) / vol_std
    return df


def _add_targets(df: pd.DataFrame, horizons: tuple[int, ...]) -> pd.DataFrame:
    """Add forward-looking target columns for each horizon ``h`` (in rows).

    For every ``h`` this adds, in place:
      - ``y_close_h{h}``: close shifted ``h`` rows into the future
      - ``y_ret_h{h}``: ``y_close_h / close - 1``
      - ``y_dir_h{h}``: 1.0 if the return exceeds ``FLAT_BAND``, -1.0 if it is
        below ``-FLAT_BAND``, 0.0 otherwise, and NaN when the return is NaN.

    The direction is NaN (not 0) when the future close is unknown, so the last
    ``h`` rows are dropped by ``dropna`` rather than being labelled "flat".
    As a consequence the direction column has float dtype.
    """
    close = df["close"].astype(float)
    for h in horizons:
        future_close = close.shift(-h)
        ret = future_close / close - 1.0
        direction = np.where(
            ret > FLAT_BAND,
            1.0,
            np.where(ret < -FLAT_BAND, -1.0, 0.0),
        )
        df[f"y_close_h{h}"] = future_close
        df[f"y_ret_h{h}"] = ret
        # Comparisons against NaN are False, which would fall through to 0.0
        # ("flat"); mask those rows back to NaN explicitly.
        df[f"y_dir_h{h}"] = np.where(ret.isna(), np.nan, direction)
    return df


def build_features(
    code: str,
    *,
    lookback_days: int = 365 * 2,
    end_date: date | None = None,
    horizons: tuple[int, ...] = HORIZONS_DEFAULT,
    with_targets: bool = False,
) -> FeatureFrame:
    """Build the per-date feature DataFrame for a single stock code.

    Inference: call with ``with_targets=False`` and feed the features of the
    latest row to the model.
    Training: call with ``with_targets=True``; the last H rows have NaN
    targets and are expected to be removed with ``dropna`` by the caller.

    Args:
        code: Stock code to load.
        lookback_days: Calendar days before ``end_date`` to include.
        end_date: Last date to load (inclusive); defaults to ``date.today()``.
        horizons: Target horizons, in rows, used when ``with_targets`` is true.
        with_targets: Whether to append the ``y_*`` target columns.

    Returns:
        A ``FeatureFrame``. When no OHLCV rows exist for the range, its ``df``
        is the empty OHLCV frame with no feature columns.
    """
    end = end_date or date.today()
    start = end - timedelta(days=lookback_days)

    ohlcv = _load_ohlcv(code, start, end)
    if ohlcv.empty:
        return FeatureFrame(code=code, df=ohlcv, target_horizons=horizons)

    df = ohlcv.copy().sort_values("date").reset_index(drop=True)
    df = _add_ta(df)

    trading = _load_trading(code, start, end)
    if not trading.empty:
        df = df.merge(trading, on="date", how="left")
    else:
        for col in ("foreign_net", "institution_net", "individual_net"):
            df[col] = np.nan

    macro = _load_macro(start, end)
    if not macro.empty:
        df = df.merge(macro, on="date", how="left")
        for k in ("kospi", "kosdaq", "usdkrw", "us10y"):
            if k in df.columns:
                df[f"{k}_r1"] = df[k].pct_change()

    rolling_cols = ("mean_score", "weighted_score", "pos_minus_neg", "n_articles")
    sentiment = _load_sentiment(code, start, end)
    if not sentiment.empty:
        df = df.merge(sentiment, on="date", how="left")
        # Rolling mean over the last 3 rows (fewer at the start of the frame).
        for col in rolling_cols:
            if col in df.columns:
                df[f"{col}_3d"] = df[col].rolling(3, min_periods=1).mean()
    else:
        for col in (
            "n_articles",
            "mean_score",
            "pos_ratio",
            "neg_ratio",
            "weighted_score",
            "pos_minus_neg",
        ):
            df[col] = np.nan
        # Keep the column set identical to the has-sentiment case so that
        # feature_columns() is the same for every code.
        for col in rolling_cols:
            df[f"{col}_3d"] = np.nan

    if with_targets:
        df = _add_targets(df, horizons)

    return FeatureFrame(code=code, df=df, target_horizons=horizons)


def feature_columns(df: pd.DataFrame) -> list[str]:
    """Return the model feature column names: everything except date, OHLCV and ``y_*``."""
    excluded = {"date", "open", "high", "low", "close", "volume"}
    return [
        name
        for name in df.columns
        if name not in excluded and not name.startswith("y_")
    ]