Add explain analysis workflow and UI

2026-03-16 22:28:41 +08:00
parent 3a5558b576
commit 1f5ee3698e
49 changed files with 8888 additions and 1476 deletions

View File

@@ -7,6 +7,7 @@ from datetime import datetime
 from typing import Callable, Dict, List, Optional
 import pandas as pd
+from backend.data.market_store import MarketStore
 from backend.data.provider_utils import normalize_symbol
 from backend.data.provider_router import get_provider_router
@@ -26,6 +27,7 @@ class HistoricalPriceManager:
         self.close_prices = {}
         self.running = False
         self._router = get_provider_router()
+        self._market_store = MarketStore()

     def subscribe(
         self,
@@ -58,21 +60,48 @@ class HistoricalPriceManager:
             logger.warning(f"Failed to load CSV for {symbol}: {e}")
             return None

+    def _load_from_market_db(
+        self,
+        symbol: str,
+        start_date: str,
+        end_date: str,
+    ) -> Optional[pd.DataFrame]:
+        """Load price data from the long-lived market research database."""
+        try:
+            rows = self._market_store.get_ohlc(symbol, start_date, end_date)
+            if not rows:
+                return None
+            df = pd.DataFrame(rows)
+            if df.empty or "date" not in df.columns:
+                return None
+            df["Date"] = pd.to_datetime(df["date"])
+            df.set_index("Date", inplace=True)
+            df.sort_index(inplace=True)
+            return df
+        except Exception as e:
+            logger.warning(f"Failed to load market DB data for {symbol}: {e}")
+            return None
+
     def preload_data(self, start_date: str, end_date: str):
-        """Preload historical data from local CSV files."""
+        """Preload historical data from market DB first, then local CSV."""
         logger.info(f"Preloading data: {start_date} to {end_date}")
         for symbol in self.subscribed_symbols:
             if symbol in self._price_cache:
                 continue
-            # Load from local CSV file directly
+            df = self._load_from_market_db(symbol, start_date, end_date)
+            if df is not None and not df.empty:
+                self._price_cache[symbol] = df
+                logger.info(f"Loaded {symbol} from market DB: {len(df)} records")
+                continue
             df = self._load_from_csv(symbol)
             if df is not None and not df.empty:
                 self._price_cache[symbol] = df
                 logger.info(f"Loaded {symbol} from CSV: {len(df)} records")
             else:
-                logger.warning(f"No CSV data for {symbol}")
+                logger.warning(f"No market DB or CSV data for {symbol}")

     def set_date(self, date: str):
         """Set current trading date and update prices"""

View File

@@ -0,0 +1,149 @@
# -*- coding: utf-8 -*-
"""Ingest Polygon market data into the long-lived research warehouse."""
from __future__ import annotations

from datetime import datetime, timedelta, timezone
from typing import Iterable

from backend.data.market_store import MarketStore
from backend.data.news_alignment import align_news_for_symbol
from backend.data.polygon_client import (
    fetch_news,
    fetch_ohlc,
    fetch_ticker_details,
)
from backend.data.provider_utils import normalize_symbol


def _today_utc() -> str:
    return datetime.now(timezone.utc).date().isoformat()


def _default_start(years: int = 2) -> str:
    return (datetime.now(timezone.utc).date() - timedelta(days=years * 366)).isoformat()


def ingest_ticker_history(
    symbol: str,
    *,
    start_date: str | None = None,
    end_date: str | None = None,
    store: MarketStore | None = None,
) -> dict:
    """Fetch and persist Polygon OHLC + news for a ticker."""
    ticker = normalize_symbol(symbol)
    start = start_date or _default_start()
    end = end_date or _today_utc()
    market_store = store or MarketStore()

    details = fetch_ticker_details(ticker)
    market_store.upsert_ticker(
        symbol=ticker,
        name=details.get("name"),
        sector=details.get("sic_description"),
        is_active=bool(details.get("active", True)),
    )

    ohlc_rows = fetch_ohlc(ticker, start, end)
    news_rows = fetch_news(ticker, start, end)
    price_count = market_store.upsert_ohlc(ticker, ohlc_rows, source="polygon")
    news_count = market_store.upsert_news(ticker, news_rows, source="polygon")
    aligned_count = align_news_for_symbol(market_store, ticker)
    market_store.update_fetch_watermark(symbol=ticker, price_date=end, news_date=end)
    return {
        "symbol": ticker,
        "start_date": start,
        "end_date": end,
        "prices": price_count,
        "news": news_count,
        "aligned": aligned_count,
    }


def update_ticker_incremental(
    symbol: str,
    *,
    end_date: str | None = None,
    store: MarketStore | None = None,
) -> dict:
    """Incrementally fetch OHLC + news since the last watermark."""
    ticker = normalize_symbol(symbol)
    market_store = store or MarketStore()
    watermarks = market_store.get_ticker_watermarks(ticker)
    end = end_date or _today_utc()
    start_prices = (
        (datetime.fromisoformat(watermarks["last_price_fetch"]) + timedelta(days=1)).date().isoformat()
        if watermarks.get("last_price_fetch")
        else _default_start()
    )
    start_news = (
        (datetime.fromisoformat(watermarks["last_news_fetch"]) + timedelta(days=1)).date().isoformat()
        if watermarks.get("last_news_fetch")
        else _default_start()
    )

    details = fetch_ticker_details(ticker)
    market_store.upsert_ticker(
        symbol=ticker,
        name=details.get("name"),
        sector=details.get("sic_description"),
        is_active=bool(details.get("active", True)),
    )

    ohlc_rows = [] if start_prices > end else fetch_ohlc(ticker, start_prices, end)
    news_rows = [] if start_news > end else fetch_news(ticker, start_news, end)
    price_count = market_store.upsert_ohlc(ticker, ohlc_rows, source="polygon") if ohlc_rows else 0
    news_count = market_store.upsert_news(ticker, news_rows, source="polygon") if news_rows else 0
    aligned_count = align_news_for_symbol(market_store, ticker)
    market_store.update_fetch_watermark(
        symbol=ticker,
        price_date=end if ohlc_rows or watermarks.get("last_price_fetch") else None,
        news_date=end if news_rows or watermarks.get("last_news_fetch") else None,
    )
    return {
        "symbol": ticker,
        "start_price_date": start_prices,
        "start_news_date": start_news,
        "end_date": end,
        "prices": price_count,
        "news": news_count,
        "aligned": aligned_count,
    }


def ingest_symbols(
    symbols: Iterable[str],
    *,
    mode: str = "incremental",
    start_date: str | None = None,
    end_date: str | None = None,
    store: MarketStore | None = None,
) -> list[dict]:
    """Fetch Polygon data for a list of tickers."""
    market_store = store or MarketStore()
    results = []
    for symbol in symbols:
        ticker = normalize_symbol(symbol)
        if not ticker:
            continue
        if mode == "full":
            results.append(
                ingest_ticker_history(
                    ticker,
                    start_date=start_date,
                    end_date=end_date,
                    store=market_store,
                )
            )
        else:
            results.append(
                update_ticker_incremental(
                    ticker,
                    end_date=end_date,
                    store=market_store,
                )
            )
    return results
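
A short usage sketch for the module above (the module path backend.data.market_ingest is an assumption, since the diff viewer does not show this file's name; a live POLYGON_API_KEY and network access are required):

    # Assumed module path -- not visible in this diff.
    from backend.data.market_ingest import ingest_symbols

    # One-off backfill; defaults to roughly two years of history per ticker.
    for result in ingest_symbols(["AAPL", "MSFT"], mode="full"):
        print(f"{result['symbol']}: {result['prices']} bars, "
              f"{result['news']} articles, {result['aligned']} aligned")

    # Later runs fetch only data past each ticker's stored watermark.
    ingest_symbols(["AAPL", "MSFT"], mode="incremental")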

backend/data/market_store.py (new file, 1,074 lines)

File diff suppressed because it is too large
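
The suppressed file implements the MarketStore warehouse. Its public surface, as inferred purely from the call sites elsewhere in this commit, looks roughly like the stub below; every signature is reconstructed, not authoritative:

    # Sketch of MarketStore as used by this commit -- the real 1,074-line
    # implementation is suppressed above, so all signatures here are inferred.
    class MarketStore:
        def upsert_ticker(self, *, symbol: str, name=None, sector=None, is_active: bool = True) -> None: ...
        def upsert_ohlc(self, symbol: str, rows: list[dict], *, source: str) -> int: ...
        def upsert_news(self, symbol: str, rows: list[dict], *, source: str) -> int: ...
        def get_ohlc(self, symbol: str, start_date: str, end_date: str) -> list[dict]: ...
        def get_ticker_watermarks(self, symbol: str) -> dict: ...
        def update_fetch_watermark(self, *, symbol: str, price_date=None, news_date=None) -> None: ...
        def get_news_without_trade_date(self, symbol: str, *, limit: int = 5000) -> list[dict]: ...
        def set_trade_dates(self, updates: list[dict]) -> int: ...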

View File

@@ -0,0 +1,64 @@
# -*- coding: utf-8 -*-
"""Align persisted news to the nearest NYSE trading date."""
from __future__ import annotations

from datetime import time

import pandas as pd
import pandas_market_calendars as mcal

from backend.data.market_store import MarketStore

NYSE_CALENDAR = mcal.get_calendar("NYSE")


def _next_trading_day(date_str: str) -> str:
    start = pd.Timestamp(date_str).tz_localize(None)
    sessions = NYSE_CALENDAR.valid_days(
        start_date=(start - pd.Timedelta(days=1)).strftime("%Y-%m-%d"),
        end_date=(start + pd.Timedelta(days=10)).strftime("%Y-%m-%d"),
    )
    future = [
        pd.Timestamp(day).tz_localize(None).strftime("%Y-%m-%d")
        for day in sessions
        if pd.Timestamp(day).tz_localize(None) >= start
    ]
    return future[0] if future else date_str


def resolve_trade_date(published_utc: str | None) -> str | None:
    """Map a published timestamp to an NYSE trade date."""
    if not published_utc:
        return None
    timestamp = pd.to_datetime(published_utc, utc=True, errors="coerce")
    if pd.isna(timestamp):
        return None
    nyse_time = timestamp.tz_convert("America/New_York")
    candidate = nyse_time.date().isoformat()
    valid_days = NYSE_CALENDAR.valid_days(start_date=candidate, end_date=candidate)
    if len(valid_days) == 0:
        return _next_trading_day(candidate)
    if nyse_time.time() >= time(16, 0):
        return _next_trading_day((nyse_time + pd.Timedelta(days=1)).date().isoformat())
    return candidate


def align_news_for_symbol(store: MarketStore, symbol: str, *, limit: int = 5000) -> int:
    """Fill missing trade_date values for one ticker."""
    pending = store.get_news_without_trade_date(symbol, limit=limit)
    updates = []
    for row in pending:
        trade_date = resolve_trade_date(row.get("published_utc"))
        if trade_date:
            updates.append(
                {
                    "news_id": row["news_id"],
                    "symbol": row["symbol"],
                    "trade_date": trade_date,
                }
            )
    if not updates:
        return 0
    return store.set_trade_dates(updates)
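
The 16:00 ET cutoff pushes after-hours and weekend stories to the next session. A few illustrative calls (expected values assume the standard NYSE calendar; 2024-06-07 is a Friday):

    from backend.data.news_alignment import resolve_trade_date

    # 22:30 UTC is 18:30 ET -- after the close, so the story rolls past
    # the weekend to Monday's session:
    resolve_trade_date("2024-06-07T22:30:00Z")  # -> "2024-06-10"

    # 15:00 UTC is 11:00 ET -- mid-session, so it keeps its own trade date:
    resolve_trade_date("2024-06-07T15:00:00Z")  # -> "2024-06-07"

    # Missing or unparseable timestamps yield None and are skipped upstream:
    resolve_trade_date(None)  # -> None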

View File

@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
"""Polygon client used for long-lived market research ingestion."""
from __future__ import annotations

import os
import time
from datetime import datetime, timezone
from typing import Any, Optional

import requests

BASE = "https://api.polygon.io"


def _headers() -> dict[str, str]:
    api_key = os.getenv("POLYGON_API_KEY", "").strip()
    if not api_key:
        raise ValueError("Missing required API key: POLYGON_API_KEY")
    return {"Authorization": f"Bearer {api_key}"}


def http_get(
    url: str,
    params: Optional[dict[str, Any]] = None,
    *,
    max_retries: int = 8,
    backoff: float = 2.0,
) -> requests.Response:
    """HTTP GET with exponential backoff and 429 handling."""
    for attempt in range(max_retries):
        try:
            response = requests.get(
                url,
                params=params or {},
                headers=_headers(),
                timeout=30,
            )
        except requests.RequestException:
            time.sleep((backoff**attempt) + 0.5)
            if attempt == max_retries - 1:
                raise
            continue
        if response.status_code == 429:
            retry_after = response.headers.get("Retry-After")
            wait = (
                float(retry_after)
                if retry_after and retry_after.isdigit()
                else min((backoff**attempt) + 1.0, 60.0)
            )
            time.sleep(wait)
            if attempt == max_retries - 1:
                response.raise_for_status()
            continue
        if 500 <= response.status_code < 600:
            time.sleep(min((backoff**attempt) + 1.0, 60.0))
            if attempt == max_retries - 1:
                response.raise_for_status()
            continue
        response.raise_for_status()
        return response
    raise RuntimeError("Unreachable")


def fetch_ticker_details(symbol: str) -> dict[str, Any]:
    """Fetch company metadata from Polygon."""
    response = http_get(f"{BASE}/v3/reference/tickers/{symbol}")
    return response.json().get("results", {}) or {}


def fetch_ohlc(symbol: str, start_date: str, end_date: str) -> list[dict[str, Any]]:
    """Fetch daily OHLC data from Polygon."""
    response = http_get(
        f"{BASE}/v2/aggs/ticker/{symbol}/range/1/day/{start_date}/{end_date}",
        params={"adjusted": "true", "sort": "asc", "limit": 50000},
    )
    results = response.json().get("results") or []
    rows: list[dict[str, Any]] = []
    for item in results:
        rows.append(
            {
                "date": datetime.fromtimestamp(
                    int(item["t"]) / 1000,
                    tz=timezone.utc,
                ).date().isoformat(),
                "open": item.get("o"),
                "high": item.get("h"),
                "low": item.get("l"),
                "close": item.get("c"),
                "volume": item.get("v"),
                "vwap": item.get("vw"),
                "transactions": item.get("n"),
            }
        )
    return rows


def fetch_news(
    symbol: str,
    start_date: str,
    end_date: str,
    *,
    per_page: int = 50,
    page_sleep: float = 1.2,
    max_pages: Optional[int] = None,
) -> list[dict[str, Any]]:
    """Fetch all Polygon news for a ticker, with pagination."""
    url = f"{BASE}/v2/reference/news"
    params = {
        "ticker": symbol,
        "published_utc.gte": start_date,
        "published_utc.lte": end_date,
        "limit": per_page,
        "order": "asc",
    }
    next_url: Optional[str] = None
    pages = 0
    all_articles: list[dict[str, Any]] = []
    seen_ids: set[str] = set()
    while True:
        response = http_get(next_url or url, params=None if next_url else params)
        data = response.json()
        results = data.get("results") or []
        if not results:
            break
        for item in results:
            article_id = item.get("id")
            if article_id and article_id in seen_ids:
                continue
            all_articles.append(
                {
                    "id": article_id,
                    "publisher": (item.get("publisher") or {}).get("name"),
                    "title": item.get("title"),
                    "author": item.get("author"),
                    "published_utc": item.get("published_utc"),
                    "amp_url": item.get("amp_url"),
                    "article_url": item.get("article_url"),
                    "tickers": item.get("tickers"),
                    "description": item.get("description"),
                    "insights": item.get("insights"),
                }
            )
            if article_id:
                seen_ids.add(article_id)
        next_url = data.get("next_url")
        pages += 1
        if max_pages is not None and pages >= max_pages:
            break
        if not next_url:
            break
        time.sleep(page_sleep)
    return all_articles
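
A minimal end-to-end sketch of the client (the module path is an assumption, and a valid POLYGON_API_KEY plus network access are required; max_pages is handy for capping pagination while testing):

    # Assumed module path; requires POLYGON_API_KEY and network access.
    from backend.data.polygon_client import fetch_news, fetch_ohlc

    bars = fetch_ohlc("AAPL", "2024-05-01", "2024-05-31")
    print(len(bars), bars[0]["date"], bars[0]["close"])

    # page_sleep throttles pagination; max_pages caps it during testing.
    articles = fetch_news("AAPL", "2024-05-01", "2024-05-31", max_pages=2)
    print(len(articles), articles[0]["title"] if articles else None)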