Add explain analysis workflow and UI

2026-03-16 22:28:41 +08:00
parent 3a5558b576
commit 1f5ee3698e
49 changed files with 8888 additions and 1476 deletions

View File

@@ -7,6 +7,7 @@ from datetime import datetime
 from typing import Callable, Dict, List, Optional
 import pandas as pd
+from backend.data.market_store import MarketStore
 from backend.data.provider_utils import normalize_symbol
 from backend.data.provider_router import get_provider_router
@@ -26,6 +27,7 @@ class HistoricalPriceManager:
         self.close_prices = {}
         self.running = False
         self._router = get_provider_router()
+        self._market_store = MarketStore()

     def subscribe(
         self,
@@ -58,21 +60,48 @@ class HistoricalPriceManager:
             logger.warning(f"Failed to load CSV for {symbol}: {e}")
             return None

+    def _load_from_market_db(
+        self,
+        symbol: str,
+        start_date: str,
+        end_date: str,
+    ) -> Optional[pd.DataFrame]:
+        """Load price data from the long-lived market research database."""
+        try:
+            rows = self._market_store.get_ohlc(symbol, start_date, end_date)
+            if not rows:
+                return None
+            df = pd.DataFrame(rows)
+            if df.empty or "date" not in df.columns:
+                return None
+            df["Date"] = pd.to_datetime(df["date"])
+            df.set_index("Date", inplace=True)
+            df.sort_index(inplace=True)
+            return df
+        except Exception as e:
+            logger.warning(f"Failed to load market DB data for {symbol}: {e}")
+            return None
+
     def preload_data(self, start_date: str, end_date: str):
-        """Preload historical data from local CSV files."""
+        """Preload historical data from market DB first, then local CSV."""
         logger.info(f"Preloading data: {start_date} to {end_date}")
         for symbol in self.subscribed_symbols:
             if symbol in self._price_cache:
                 continue
-            # Load from local CSV file directly
+            df = self._load_from_market_db(symbol, start_date, end_date)
+            if df is not None and not df.empty:
+                self._price_cache[symbol] = df
+                logger.info(f"Loaded {symbol} from market DB: {len(df)} records")
+                continue
             df = self._load_from_csv(symbol)
             if df is not None and not df.empty:
                 self._price_cache[symbol] = df
                 logger.info(f"Loaded {symbol} from CSV: {len(df)} records")
             else:
-                logger.warning(f"No CSV data for {symbol}")
+                logger.warning(f"No market DB or CSV data for {symbol}")

     def set_date(self, date: str):
         """Set current trading date and update prices"""

View File

@@ -0,0 +1,149 @@
# -*- coding: utf-8 -*-
"""Ingest Polygon market data into the long-lived research warehouse."""
from __future__ import annotations

from datetime import datetime, timedelta, timezone
from typing import Iterable

from backend.data.market_store import MarketStore
from backend.data.news_alignment import align_news_for_symbol
from backend.data.polygon_client import (
    fetch_news,
    fetch_ohlc,
    fetch_ticker_details,
)
from backend.data.provider_utils import normalize_symbol


def _today_utc() -> str:
    return datetime.now(timezone.utc).date().isoformat()


def _default_start(years: int = 2) -> str:
    return (datetime.now(timezone.utc).date() - timedelta(days=years * 366)).isoformat()


def ingest_ticker_history(
    symbol: str,
    *,
    start_date: str | None = None,
    end_date: str | None = None,
    store: MarketStore | None = None,
) -> dict:
    """Fetch and persist Polygon OHLC + news for a ticker."""
    ticker = normalize_symbol(symbol)
    start = start_date or _default_start()
    end = end_date or _today_utc()
    market_store = store or MarketStore()

    details = fetch_ticker_details(ticker)
    market_store.upsert_ticker(
        symbol=ticker,
        name=details.get("name"),
        sector=details.get("sic_description"),
        is_active=bool(details.get("active", True)),
    )

    ohlc_rows = fetch_ohlc(ticker, start, end)
    news_rows = fetch_news(ticker, start, end)
    price_count = market_store.upsert_ohlc(ticker, ohlc_rows, source="polygon")
    news_count = market_store.upsert_news(ticker, news_rows, source="polygon")
    aligned_count = align_news_for_symbol(market_store, ticker)
    market_store.update_fetch_watermark(symbol=ticker, price_date=end, news_date=end)
    return {
        "symbol": ticker,
        "start_date": start,
        "end_date": end,
        "prices": price_count,
        "news": news_count,
        "aligned": aligned_count,
    }


def update_ticker_incremental(
    symbol: str,
    *,
    end_date: str | None = None,
    store: MarketStore | None = None,
) -> dict:
    """Incrementally fetch OHLC + news since the last watermark."""
    ticker = normalize_symbol(symbol)
    market_store = store or MarketStore()
    watermarks = market_store.get_ticker_watermarks(ticker)
    end = end_date or _today_utc()
    start_prices = (
        (datetime.fromisoformat(watermarks["last_price_fetch"]) + timedelta(days=1)).date().isoformat()
        if watermarks.get("last_price_fetch")
        else _default_start()
    )
    start_news = (
        (datetime.fromisoformat(watermarks["last_news_fetch"]) + timedelta(days=1)).date().isoformat()
        if watermarks.get("last_news_fetch")
        else _default_start()
    )

    details = fetch_ticker_details(ticker)
    market_store.upsert_ticker(
        symbol=ticker,
        name=details.get("name"),
        sector=details.get("sic_description"),
        is_active=bool(details.get("active", True)),
    )

    ohlc_rows = [] if start_prices > end else fetch_ohlc(ticker, start_prices, end)
    news_rows = [] if start_news > end else fetch_news(ticker, start_news, end)
    price_count = market_store.upsert_ohlc(ticker, ohlc_rows, source="polygon") if ohlc_rows else 0
    news_count = market_store.upsert_news(ticker, news_rows, source="polygon") if news_rows else 0
    aligned_count = align_news_for_symbol(market_store, ticker)
    market_store.update_fetch_watermark(
        symbol=ticker,
        price_date=end if ohlc_rows or watermarks.get("last_price_fetch") else None,
        news_date=end if news_rows or watermarks.get("last_news_fetch") else None,
    )
    return {
        "symbol": ticker,
        "start_price_date": start_prices,
        "start_news_date": start_news,
        "end_date": end,
        "prices": price_count,
        "news": news_count,
        "aligned": aligned_count,
    }


def ingest_symbols(
    symbols: Iterable[str],
    *,
    mode: str = "incremental",
    start_date: str | None = None,
    end_date: str | None = None,
    store: MarketStore | None = None,
) -> list[dict]:
    """Fetch Polygon data for a list of tickers."""
    market_store = store or MarketStore()
    results = []
    for symbol in symbols:
        ticker = normalize_symbol(symbol)
        if not ticker:
            continue
        if mode == "full":
            results.append(
                ingest_ticker_history(
                    ticker,
                    start_date=start_date,
                    end_date=end_date,
                    store=market_store,
                )
            )
        else:
            results.append(
                update_ticker_incremental(
                    ticker,
                    end_date=end_date,
                    store=market_store,
                )
            )
    return results
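
A short usage sketch for the module above (the module path backend.data.market_ingest is an assumption, since the diff viewer does not show this file's name; a live POLYGON_API_KEY and network access are required):

    # Assumed module path -- not visible in this diff.
    from backend.data.market_ingest import ingest_symbols

    # One-off backfill; defaults to roughly two years of history per ticker.
    for result in ingest_symbols(["AAPL", "MSFT"], mode="full"):
        print(f"{result['symbol']}: {result['prices']} bars, "
              f"{result['news']} articles, {result['aligned']} aligned")

    # Later runs fetch only data past each ticker's stored watermark.
    ingest_symbols(["AAPL", "MSFT"], mode="incremental")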

backend/data/market_store.py (new file, 1,074 lines)

File diff suppressed because it is too large
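
The suppressed file implements the MarketStore warehouse. Its public surface, as inferred purely from the call sites elsewhere in this commit, looks roughly like the stub below; every signature is reconstructed, not authoritative:

    # Sketch of MarketStore as used by this commit -- the real 1,074-line
    # implementation is suppressed above, so all signatures here are inferred.
    class MarketStore:
        def upsert_ticker(self, *, symbol: str, name=None, sector=None, is_active: bool = True) -> None: ...
        def upsert_ohlc(self, symbol: str, rows: list[dict], *, source: str) -> int: ...
        def upsert_news(self, symbol: str, rows: list[dict], *, source: str) -> int: ...
        def get_ohlc(self, symbol: str, start_date: str, end_date: str) -> list[dict]: ...
        def get_ticker_watermarks(self, symbol: str) -> dict: ...
        def update_fetch_watermark(self, *, symbol: str, price_date=None, news_date=None) -> None: ...
        def get_news_without_trade_date(self, symbol: str, *, limit: int = 5000) -> list[dict]: ...
        def set_trade_dates(self, updates: list[dict]) -> int: ...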

View File

@@ -0,0 +1,64 @@
# -*- coding: utf-8 -*-
"""Align persisted news to the nearest NYSE trading date."""
from __future__ import annotations

from datetime import time

import pandas as pd
import pandas_market_calendars as mcal

from backend.data.market_store import MarketStore

NYSE_CALENDAR = mcal.get_calendar("NYSE")


def _next_trading_day(date_str: str) -> str:
    start = pd.Timestamp(date_str).tz_localize(None)
    sessions = NYSE_CALENDAR.valid_days(
        start_date=(start - pd.Timedelta(days=1)).strftime("%Y-%m-%d"),
        end_date=(start + pd.Timedelta(days=10)).strftime("%Y-%m-%d"),
    )
    future = [
        pd.Timestamp(day).tz_localize(None).strftime("%Y-%m-%d")
        for day in sessions
        if pd.Timestamp(day).tz_localize(None) >= start
    ]
    return future[0] if future else date_str


def resolve_trade_date(published_utc: str | None) -> str | None:
    """Map a published timestamp to an NYSE trade date."""
    if not published_utc:
        return None
    timestamp = pd.to_datetime(published_utc, utc=True, errors="coerce")
    if pd.isna(timestamp):
        return None
    nyse_time = timestamp.tz_convert("America/New_York")
    candidate = nyse_time.date().isoformat()
    valid_days = NYSE_CALENDAR.valid_days(start_date=candidate, end_date=candidate)
    if len(valid_days) == 0:
        return _next_trading_day(candidate)
    if nyse_time.time() >= time(16, 0):
        return _next_trading_day((nyse_time + pd.Timedelta(days=1)).date().isoformat())
    return candidate


def align_news_for_symbol(store: MarketStore, symbol: str, *, limit: int = 5000) -> int:
    """Fill missing trade_date values for one ticker."""
    pending = store.get_news_without_trade_date(symbol, limit=limit)
    updates = []
    for row in pending:
        trade_date = resolve_trade_date(row.get("published_utc"))
        if trade_date:
            updates.append(
                {
                    "news_id": row["news_id"],
                    "symbol": row["symbol"],
                    "trade_date": trade_date,
                }
            )
    if not updates:
        return 0
    return store.set_trade_dates(updates)
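
The 16:00 ET cutoff pushes after-hours and weekend stories to the next session. A few illustrative calls (expected values assume the standard NYSE calendar; 2024-06-07 is a Friday):

    from backend.data.news_alignment import resolve_trade_date

    # 22:30 UTC is 18:30 ET -- after the close, so the story rolls past
    # the weekend to Monday's session:
    resolve_trade_date("2024-06-07T22:30:00Z")  # -> "2024-06-10"

    # 15:00 UTC is 11:00 ET -- mid-session, so it keeps its own trade date:
    resolve_trade_date("2024-06-07T15:00:00Z")  # -> "2024-06-07"

    # Missing or unparseable timestamps yield None and are skipped upstream:
    resolve_trade_date(None)  # -> None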

View File

@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
"""Polygon client used for long-lived market research ingestion."""
from __future__ import annotations

import os
import time
from datetime import datetime, timezone
from typing import Any, Optional

import requests

BASE = "https://api.polygon.io"


def _headers() -> dict[str, str]:
    api_key = os.getenv("POLYGON_API_KEY", "").strip()
    if not api_key:
        raise ValueError("Missing required API key: POLYGON_API_KEY")
    return {"Authorization": f"Bearer {api_key}"}


def http_get(
    url: str,
    params: Optional[dict[str, Any]] = None,
    *,
    max_retries: int = 8,
    backoff: float = 2.0,
) -> requests.Response:
    """HTTP GET with exponential backoff and 429 handling."""
    for attempt in range(max_retries):
        try:
            response = requests.get(
                url,
                params=params or {},
                headers=_headers(),
                timeout=30,
            )
        except requests.RequestException:
            time.sleep((backoff**attempt) + 0.5)
            if attempt == max_retries - 1:
                raise
            continue
        if response.status_code == 429:
            retry_after = response.headers.get("Retry-After")
            wait = (
                float(retry_after)
                if retry_after and retry_after.isdigit()
                else min((backoff**attempt) + 1.0, 60.0)
            )
            time.sleep(wait)
            if attempt == max_retries - 1:
                response.raise_for_status()
            continue
        if 500 <= response.status_code < 600:
            time.sleep(min((backoff**attempt) + 1.0, 60.0))
            if attempt == max_retries - 1:
                response.raise_for_status()
            continue
        response.raise_for_status()
        return response
    raise RuntimeError("Unreachable")


def fetch_ticker_details(symbol: str) -> dict[str, Any]:
    """Fetch company metadata from Polygon."""
    response = http_get(f"{BASE}/v3/reference/tickers/{symbol}")
    return response.json().get("results", {}) or {}


def fetch_ohlc(symbol: str, start_date: str, end_date: str) -> list[dict[str, Any]]:
    """Fetch daily OHLC data from Polygon."""
    response = http_get(
        f"{BASE}/v2/aggs/ticker/{symbol}/range/1/day/{start_date}/{end_date}",
        params={"adjusted": "true", "sort": "asc", "limit": 50000},
    )
    results = response.json().get("results") or []
    rows: list[dict[str, Any]] = []
    for item in results:
        rows.append(
            {
                "date": datetime.fromtimestamp(
                    int(item["t"]) / 1000,
                    tz=timezone.utc,
                ).date().isoformat(),
                "open": item.get("o"),
                "high": item.get("h"),
                "low": item.get("l"),
                "close": item.get("c"),
                "volume": item.get("v"),
                "vwap": item.get("vw"),
                "transactions": item.get("n"),
            }
        )
    return rows


def fetch_news(
    symbol: str,
    start_date: str,
    end_date: str,
    *,
    per_page: int = 50,
    page_sleep: float = 1.2,
    max_pages: Optional[int] = None,
) -> list[dict[str, Any]]:
    """Fetch all Polygon news for a ticker, with pagination."""
    url = f"{BASE}/v2/reference/news"
    params = {
        "ticker": symbol,
        "published_utc.gte": start_date,
        "published_utc.lte": end_date,
        "limit": per_page,
        "order": "asc",
    }
    next_url: Optional[str] = None
    pages = 0
    all_articles: list[dict[str, Any]] = []
    seen_ids: set[str] = set()
    while True:
        response = http_get(next_url or url, params=None if next_url else params)
        data = response.json()
        results = data.get("results") or []
        if not results:
            break
        for item in results:
            article_id = item.get("id")
            if article_id and article_id in seen_ids:
                continue
            all_articles.append(
                {
                    "id": article_id,
                    "publisher": (item.get("publisher") or {}).get("name"),
                    "title": item.get("title"),
                    "author": item.get("author"),
                    "published_utc": item.get("published_utc"),
                    "amp_url": item.get("amp_url"),
                    "article_url": item.get("article_url"),
                    "tickers": item.get("tickers"),
                    "description": item.get("description"),
                    "insights": item.get("insights"),
                }
            )
            if article_id:
                seen_ids.add(article_id)
        next_url = data.get("next_url")
        pages += 1
        if max_pages is not None and pages >= max_pages:
            break
        if not next_url:
            break
        time.sleep(page_sleep)
    return all_articles
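
A minimal end-to-end sketch of the client (the module path is an assumption, and a valid POLYGON_API_KEY plus network access are required; max_pages is handy for capping pagination while testing):

    # Assumed module path; requires POLYGON_API_KEY and network access.
    from backend.data.polygon_client import fetch_news, fetch_ohlc

    bars = fetch_ohlc("AAPL", "2024-05-01", "2024-05-31")
    print(len(bars), bars[0]["date"], bars[0]["close"])

    # page_sleep throttles pagination; max_pages caps it during testing.
    articles = fetch_news("AAPL", "2024-05-01", "2024-05-31", max_pages=2)
    print(len(articles), articles[0]["title"] if articles else None)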