Initial commit of integrated agent system

cillin
2026-03-30 17:46:44 +08:00
commit 0fa413380c
337 changed files with 75268 additions and 0 deletions

5
backend/data/__init__.py Normal file

@@ -0,0 +1,5 @@
# -*- coding: utf-8 -*-
from backend.data.historical_price_manager import HistoricalPriceManager
from backend.data.polling_price_manager import PollingPriceManager
__all__ = ["PollingPriceManager", "HistoricalPriceManager"]

107
backend/data/cache.py Normal file

@@ -0,0 +1,107 @@
# -*- coding: utf-8 -*-
from typing import Any
class Cache:
"""In-memory cache for API responses."""
def __init__(self):
self._prices_cache = {}
self._financial_metrics_cache = {}
self._line_items_cache = {}
self._insider_trades_cache = {}
self._company_news_cache = {}
def _merge_data(
self,
existing: list[dict] | None,
new_data: list[dict],
key_field: str,
) -> list[dict]:
"""Merge existing and new data"""
if not existing:
return new_data
# Create a set of existing keys for O(1) lookup
existing_keys = {item[key_field] for item in existing}
# Only add items that don't exist yet
merged = existing.copy()
merged.extend(
[
item
for item in new_data
if item[key_field] not in existing_keys
],
)
return merged
def get_prices(self, ticker: str) -> list[dict[str, Any]] | None:
"""Get cached price data if available."""
return self._prices_cache.get(ticker)
def set_prices(self, ticker: str, data: list[dict[str, Any]]):
"""Append new price data to cache."""
self._prices_cache[ticker] = self._merge_data(
self._prices_cache.get(ticker),
data,
key_field="time",
)
    def get_financial_metrics(self, ticker: str) -> list[dict[str, Any]] | None:
"""Get cached financial metrics if available."""
return self._financial_metrics_cache.get(ticker)
def set_financial_metrics(self, ticker: str, data: list[dict[str, Any]]):
"""Append new financial metrics to cache."""
self._financial_metrics_cache[ticker] = self._merge_data(
self._financial_metrics_cache.get(ticker),
data,
key_field="report_period",
)
def get_line_items(self, ticker: str) -> list[dict[str, Any]] | None:
"""Get cached line items if available."""
return self._line_items_cache.get(ticker)
def set_line_items(self, ticker: str, data: list[dict[str, Any]]):
"""Append new line items to cache."""
self._line_items_cache[ticker] = self._merge_data(
self._line_items_cache.get(ticker),
data,
key_field="report_period",
)
def get_insider_trades(self, ticker: str) -> list[dict[str, Any]] | None:
"""Get cached insider trades if available."""
return self._insider_trades_cache.get(ticker)
def set_insider_trades(self, ticker: str, data: list[dict[str, Any]]):
"""Append new insider trades to cache."""
self._insider_trades_cache[ticker] = self._merge_data(
self._insider_trades_cache.get(ticker),
data,
key_field="filing_date",
) # Could also use transaction_date if preferred
def get_company_news(self, ticker: str) -> list[dict[str, Any]] | None:
"""Get cached company news if available."""
return self._company_news_cache.get(ticker)
def set_company_news(self, ticker: str, data: list[dict[str, Any]]):
"""Append new company news to cache."""
self._company_news_cache[ticker] = self._merge_data(
self._company_news_cache.get(ticker),
data,
key_field="date",
)
# Global cache instance
_cache = Cache()
def get_cache() -> Cache:
"""Get the global cache instance."""
return _cache
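
A minimal usage sketch for this cache (hypothetical calls, not part of the commit); set_prices merges by the "time" key, so already-cached rows are never overwritten:

from backend.data.cache import get_cache

cache = get_cache()
cache.set_prices("AAPL", [{"time": "2026-03-27", "close": 178.5}])
cache.set_prices("AAPL", [
    {"time": "2026-03-27", "close": 999.0},  # dropped: "time" already cached
    {"time": "2026-03-28", "close": 180.1},  # appended
])
assert len(cache.get_prices("AAPL")) == 2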

253
backend/data/historical_price_manager.py Normal file

@@ -0,0 +1,253 @@
# -*- coding: utf-8 -*-
"""
Historical Price Manager for backtest mode
"""
import logging
from datetime import datetime
from typing import Callable, Dict, List, Optional
import pandas as pd
from backend.data.market_store import MarketStore
from backend.data.provider_utils import normalize_symbol
from backend.data.provider_router import get_provider_router
logger = logging.getLogger(__name__)
class HistoricalPriceManager:
"""Provides historical prices for backtest mode"""
def __init__(self):
self.subscribed_symbols = []
self.price_callbacks = []
self._price_cache = {}
self._current_date = None
self.latest_prices = {}
self.open_prices = {}
self.close_prices = {}
self.running = False
self._router = get_provider_router()
self._market_store = MarketStore()
def subscribe(
self,
symbols: List[str],
):
"""Subscribe to symbols"""
for symbol in symbols:
symbol = normalize_symbol(symbol)
if symbol not in self.subscribed_symbols:
self.subscribed_symbols.append(symbol)
def unsubscribe(self, symbols: List[str]):
"""Unsubscribe from symbols"""
for symbol in symbols:
symbol = normalize_symbol(symbol)
if symbol in self.subscribed_symbols:
self.subscribed_symbols.remove(symbol)
self._price_cache.pop(symbol, None)
def add_price_callback(self, callback: Callable):
"""Add price update callback"""
self.price_callbacks.append(callback)
def _load_from_csv(self, symbol: str) -> Optional[pd.DataFrame]:
"""Load price data from local CSV file."""
try:
df = self._router.load_local_price_frame(symbol)
return df if not df.empty else None
except Exception as e:
logger.warning(f"Failed to load CSV for {symbol}: {e}")
return None
def _load_from_market_db(
self,
symbol: str,
start_date: str,
end_date: str,
) -> Optional[pd.DataFrame]:
"""Load price data from the long-lived market research database."""
try:
rows = self._market_store.get_ohlc(symbol, start_date, end_date)
if not rows:
return None
df = pd.DataFrame(rows)
if df.empty or "date" not in df.columns:
return None
df["Date"] = pd.to_datetime(df["date"])
df.set_index("Date", inplace=True)
df.sort_index(inplace=True)
return df
except Exception as e:
logger.warning(f"Failed to load market DB data for {symbol}: {e}")
return None
def preload_data(self, start_date: str, end_date: str):
"""Preload historical data from market DB first, then local CSV."""
logger.info(f"Preloading data: {start_date} to {end_date}")
for symbol in self.subscribed_symbols:
if symbol in self._price_cache:
continue
df = self._load_from_market_db(symbol, start_date, end_date)
if df is not None and not df.empty:
self._price_cache[symbol] = df
logger.info(f"Loaded {symbol} from market DB: {len(df)} records")
continue
df = self._load_from_csv(symbol)
if df is not None and not df.empty:
self._price_cache[symbol] = df
logger.info(f"Loaded {symbol} from CSV: {len(df)} records")
else:
logger.warning(f"No market DB or CSV data for {symbol}")
def set_date(self, date: str):
"""Set current trading date and update prices"""
self._current_date = date
date_dt = pd.Timestamp(date)
for symbol in self.subscribed_symbols:
df = self._price_cache.get(symbol)
if df is None or df.empty:
# Keep previous prices if no data available
logger.warning(f"No cached data for {symbol} on {date}")
continue
# Find exact date or closest earlier date
if date_dt in df.index:
row = df.loc[date_dt]
else:
valid_dates = df.index[df.index <= date_dt]
if len(valid_dates) == 0:
logger.warning(f"No data for {symbol} on or before {date}")
continue
row = df.loc[valid_dates[-1]]
open_price = float(row["open"])
close_price = float(row["close"])
self.open_prices[symbol] = open_price
self.close_prices[symbol] = close_price
self.latest_prices[symbol] = open_price
logger.debug(
f"{symbol} @ {date}: open={open_price:.2f}, close={close_price:.2f}", # noqa: E501
)
def emit_open_prices(self):
"""Emit open prices to callbacks"""
if not self._current_date:
return
timestamp = int(
datetime.strptime(self._current_date, "%Y-%m-%d").timestamp()
* 1000,
)
for symbol in self.subscribed_symbols:
price = self.open_prices.get(symbol)
if price is None or price <= 0:
logger.warning(f"Invalid open price for {symbol}: {price}")
continue
self.latest_prices[symbol] = price
self._emit_price(symbol, price, timestamp)
def emit_close_prices(self):
"""Emit close prices to callbacks"""
if not self._current_date:
return
timestamp = int(
datetime.strptime(self._current_date, "%Y-%m-%d").timestamp()
* 1000,
)
        timestamp += 23400000  # offset close ticks by one 6.5-hour session, in ms
for symbol in self.subscribed_symbols:
price = self.close_prices.get(symbol)
if price is None or price <= 0:
logger.warning(f"Invalid close price for {symbol}: {price}")
continue
self.latest_prices[symbol] = price
self._emit_price(symbol, price, timestamp)
def _emit_price(self, symbol: str, price: float, timestamp: int):
"""Emit single price to callbacks"""
open_price = self.open_prices.get(symbol, price)
close_price = self.close_prices.get(symbol, price)
ret = (
((price - open_price) / open_price) * 100 if open_price > 0 else 0
)
price_data = {
"symbol": symbol,
"price": price,
"timestamp": timestamp,
"open": open_price,
"close": close_price,
"high": max(open_price, close_price),
"low": min(open_price, close_price),
"ret": ret,
}
for callback in self.price_callbacks:
try:
callback(price_data)
except Exception as e:
logger.error(f"Callback error for {symbol}: {e}")
def get_price_for_date(
self,
symbol: str,
date: str,
price_type: str = "close",
) -> Optional[float]:
"""Get price for a specific date"""
df = self._price_cache.get(symbol)
if df is None or df.empty:
return self.latest_prices.get(symbol)
date_dt = pd.Timestamp(date)
if date_dt in df.index:
return float(df.loc[date_dt, price_type])
valid_dates = df.index[df.index <= date_dt]
if len(valid_dates) == 0:
return self.latest_prices.get(symbol)
return float(df.loc[valid_dates[-1], price_type])
def start(self):
"""Start manager"""
self.running = True
def stop(self):
"""Stop manager"""
self.running = False
def get_latest_price(self, symbol: str) -> Optional[float]:
return self.latest_prices.get(symbol)
def get_all_latest_prices(self) -> Dict[str, float]:
return self.latest_prices.copy()
def get_open_price(self, symbol: str) -> Optional[float]:
# Return open price, fallback to latest if not set
price = self.open_prices.get(symbol)
if price is None or price <= 0:
return self.latest_prices.get(symbol)
return price
def get_close_price(self, symbol: str) -> Optional[float]:
# Return close price, fallback to latest if not set
price = self.close_prices.get(symbol)
if price is None or price <= 0:
return self.latest_prices.get(symbol)
return price
def reset_open_prices(self):
# Don't clear prices - keep them for continuity
pass
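
A sketch of the backtest replay loop this manager is built for, assuming price data for the symbols exists in the market DB or the local CSVs:

from backend.data import HistoricalPriceManager

manager = HistoricalPriceManager()
manager.subscribe(["AAPL", "MSFT"])
manager.add_price_callback(lambda tick: print(tick["symbol"], tick["price"]))
manager.preload_data("2024-01-02", "2024-12-31")

# Replay one trading day: pin the date, then emit the open and close ticks.
manager.set_date("2024-06-03")
manager.emit_open_prices()
manager.emit_close_prices()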


@@ -0,0 +1,299 @@
# -*- coding: utf-8 -*-
"""Ingest Polygon market data into the long-lived research warehouse."""
from __future__ import annotations
from datetime import datetime, timedelta, timezone
from typing import Any, Iterable
from backend.data.market_store import MarketStore
from backend.data.news_alignment import align_news_for_symbol
from backend.data.provider_router import DataProviderRouter
from backend.data.polygon_client import (
fetch_news,
fetch_ohlc,
fetch_ticker_details,
)
from backend.data.provider_utils import normalize_symbol
def _today_utc() -> str:
return datetime.now(timezone.utc).date().isoformat()
def _default_start(years: int = 2) -> str:
return (datetime.now(timezone.utc).date() - timedelta(days=years * 366)).isoformat()
def _max_news_date(news_rows: Iterable[dict]) -> str | None:
dates = [
str(item.get("published_utc") or "").strip()[:10]
for item in news_rows
if str(item.get("published_utc") or "").strip()
]
dates = [value for value in dates if value]
return max(dates) if dates else None
def _effective_last_news_fetch(
market_store: MarketStore,
*,
ticker: str,
end_date: str,
watermark_value: str | None,
) -> str | None:
"""Clamp stale/future watermarks to the latest actually stored news date."""
raw = str(watermark_value or "").strip()[:10]
if not raw:
return None
if raw <= end_date:
return raw
latest_stored = market_store.get_latest_news_date(ticker)
if latest_stored and latest_stored <= end_date:
return latest_stored
return end_date
def _normalize_provider_news_rows(ticker: str, news_items: Iterable[Any]) -> list[dict]:
rows: list[dict] = []
for item in news_items:
payload = item.model_dump() if hasattr(item, "model_dump") else dict(item or {})
related = payload.get("related")
if isinstance(related, str):
related_list = [value.strip().upper() for value in related.split(",") if value.strip()]
elif isinstance(related, list):
related_list = [str(value).strip().upper() for value in related if str(value).strip()]
else:
related_list = []
if ticker not in related_list:
related_list.append(ticker)
rows.append(
{
"title": payload.get("title"),
"description": payload.get("summary"),
"summary": payload.get("summary"),
"article_url": payload.get("url"),
"published_utc": payload.get("date"),
"publisher": payload.get("source"),
"tickers": related_list,
"category": payload.get("category"),
"raw_json": payload,
}
)
return rows
def ingest_ticker_history(
symbol: str,
*,
start_date: str | None = None,
end_date: str | None = None,
store: MarketStore | None = None,
) -> dict:
"""Fetch and persist Polygon OHLC + news for a ticker."""
ticker = normalize_symbol(symbol)
start = start_date or _default_start()
end = end_date or _today_utc()
market_store = store or MarketStore()
details = fetch_ticker_details(ticker)
market_store.upsert_ticker(
symbol=ticker,
name=details.get("name"),
sector=details.get("sic_description"),
is_active=bool(details.get("active", True)),
)
ohlc_rows = fetch_ohlc(ticker, start, end)
news_rows = fetch_news(ticker, start, end)
price_count = market_store.upsert_ohlc(ticker, ohlc_rows, source="polygon")
news_count = market_store.upsert_news(ticker, news_rows, source="polygon")
aligned_count = align_news_for_symbol(market_store, ticker)
market_store.update_fetch_watermark(
symbol=ticker,
price_date=end,
news_date=_max_news_date(news_rows),
)
return {
"symbol": ticker,
"start_date": start,
"end_date": end,
"prices": price_count,
"news": news_count,
"aligned": aligned_count,
}
def update_ticker_incremental(
symbol: str,
*,
end_date: str | None = None,
store: MarketStore | None = None,
) -> dict:
"""Incrementally fetch OHLC + news since the last watermark."""
ticker = normalize_symbol(symbol)
market_store = store or MarketStore()
watermarks = market_store.get_ticker_watermarks(ticker)
end = end_date or _today_utc()
start_prices = (
(datetime.fromisoformat(watermarks["last_price_fetch"]) + timedelta(days=1)).date().isoformat()
if watermarks.get("last_price_fetch")
else _default_start()
)
effective_last_news_fetch = _effective_last_news_fetch(
market_store,
ticker=ticker,
end_date=end,
watermark_value=watermarks.get("last_news_fetch"),
)
start_news = (
(datetime.fromisoformat(effective_last_news_fetch) + timedelta(days=1)).date().isoformat()
if effective_last_news_fetch
else _default_start()
)
details = fetch_ticker_details(ticker)
market_store.upsert_ticker(
symbol=ticker,
name=details.get("name"),
sector=details.get("sic_description"),
is_active=bool(details.get("active", True)),
)
ohlc_rows = [] if start_prices > end else fetch_ohlc(ticker, start_prices, end)
news_rows = [] if start_news > end else fetch_news(ticker, start_news, end)
price_count = market_store.upsert_ohlc(ticker, ohlc_rows, source="polygon") if ohlc_rows else 0
news_count = market_store.upsert_news(ticker, news_rows, source="polygon") if news_rows else 0
aligned_count = align_news_for_symbol(market_store, ticker)
market_store.update_fetch_watermark(
symbol=ticker,
price_date=end if ohlc_rows or watermarks.get("last_price_fetch") else None,
news_date=_max_news_date(news_rows),
)
return {
"symbol": ticker,
"start_price_date": start_prices,
"start_news_date": start_news,
"end_date": end,
"prices": price_count,
"news": news_count,
"aligned": aligned_count,
}
def refresh_news_incremental(
symbol: str,
*,
end_date: str | None = None,
store: MarketStore | None = None,
) -> dict:
"""Incrementally fetch company news using the configured provider router."""
ticker = normalize_symbol(symbol)
market_store = store or MarketStore()
watermarks = market_store.get_ticker_watermarks(ticker)
end = end_date or _today_utc()
effective_last_news_fetch = _effective_last_news_fetch(
market_store,
ticker=ticker,
end_date=end,
watermark_value=watermarks.get("last_news_fetch"),
)
start_news = (
(datetime.fromisoformat(effective_last_news_fetch) + timedelta(days=1)).date().isoformat()
if effective_last_news_fetch
else _default_start()
)
if start_news > end:
return {
"symbol": ticker,
"start_news_date": start_news,
"end_date": end,
"news": 0,
"aligned": 0,
}
router = DataProviderRouter()
news_items, source = router.get_company_news(
ticker=ticker,
start_date=start_news,
end_date=end,
limit=1000,
)
news_rows = _normalize_provider_news_rows(ticker, news_items)
news_count = market_store.upsert_news(ticker, news_rows, source=source) if news_rows else 0
aligned_count = align_news_for_symbol(market_store, ticker)
market_store.update_fetch_watermark(
symbol=ticker,
news_date=_max_news_date(news_rows),
)
return {
"symbol": ticker,
"start_news_date": start_news,
"end_date": end,
"news": news_count,
"aligned": aligned_count,
"source": source,
}
def refresh_news_for_symbols(
symbols: Iterable[str],
*,
end_date: str | None = None,
store: MarketStore | None = None,
) -> list[dict]:
"""Incrementally refresh company news for a list of tickers."""
market_store = store or MarketStore()
results = []
for symbol in symbols:
ticker = normalize_symbol(symbol)
if not ticker:
continue
results.append(
refresh_news_incremental(
ticker,
end_date=end_date,
store=market_store,
)
)
return results
def ingest_symbols(
symbols: Iterable[str],
*,
mode: str = "incremental",
start_date: str | None = None,
end_date: str | None = None,
store: MarketStore | None = None,
) -> list[dict]:
"""Fetch Polygon data for a list of tickers."""
market_store = store or MarketStore()
results = []
for symbol in symbols:
ticker = normalize_symbol(symbol)
if not ticker:
continue
if mode == "full":
results.append(
ingest_ticker_history(
ticker,
start_date=start_date,
end_date=end_date,
store=market_store,
)
)
else:
results.append(
update_ticker_incremental(
ticker,
end_date=end_date,
store=market_store,
)
)
return results
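
A hypothetical driver for these helpers (the module's own path is not shown above, so its import is elided), assuming POLYGON_API_KEY is set:

from backend.data.market_store import MarketStore

store = MarketStore()
# Full backfill for new tickers; later runs use mode="incremental" and only
# fetch data past the stored watermarks.
results = ingest_symbols(
    ["AAPL", "NVDA"],
    mode="full",
    start_date="2024-01-01",
    store=store,
)
for result in results:
    print(result["symbol"], "prices:", result["prices"], "news:", result["news"])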

1106
backend/data/market_store.py Normal file

File diff suppressed because it is too large

64
backend/data/news_alignment.py Normal file

@@ -0,0 +1,64 @@
# -*- coding: utf-8 -*-
"""Align persisted news to the nearest NYSE trading date."""
from __future__ import annotations
from datetime import time
import pandas as pd
import pandas_market_calendars as mcal
from backend.data.market_store import MarketStore
NYSE_CALENDAR = mcal.get_calendar("NYSE")
def _next_trading_day(date_str: str) -> str:
start = pd.Timestamp(date_str).tz_localize(None)
sessions = NYSE_CALENDAR.valid_days(
start_date=(start - pd.Timedelta(days=1)).strftime("%Y-%m-%d"),
end_date=(start + pd.Timedelta(days=10)).strftime("%Y-%m-%d"),
)
future = [
pd.Timestamp(day).tz_localize(None).strftime("%Y-%m-%d")
for day in sessions
if pd.Timestamp(day).tz_localize(None) >= start
]
return future[0] if future else date_str
def resolve_trade_date(published_utc: str | None) -> str | None:
"""Map a published timestamp to an NYSE trade date."""
if not published_utc:
return None
timestamp = pd.to_datetime(published_utc, utc=True, errors="coerce")
if pd.isna(timestamp):
return None
nyse_time = timestamp.tz_convert("America/New_York")
candidate = nyse_time.date().isoformat()
valid_days = NYSE_CALENDAR.valid_days(start_date=candidate, end_date=candidate)
if len(valid_days) == 0:
return _next_trading_day(candidate)
if nyse_time.time() >= time(16, 0):
return _next_trading_day((nyse_time + pd.Timedelta(days=1)).date().isoformat())
return candidate
def align_news_for_symbol(store: MarketStore, symbol: str, *, limit: int = 5000) -> int:
"""Fill missing trade_date values for one ticker."""
pending = store.get_news_without_trade_date(symbol, limit=limit)
updates = []
for row in pending:
trade_date = resolve_trade_date(row.get("published_utc"))
if trade_date:
updates.append(
{
"news_id": row["news_id"],
"symbol": row["symbol"],
"trade_date": trade_date,
}
)
if not updates:
return 0
return store.set_trade_dates(updates)
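
Worked examples for resolve_trade_date (dates are illustrative; June 3-7 and June 10, 2024 were NYSE sessions):

from backend.data.news_alignment import resolve_trade_date

print(resolve_trade_date("2024-06-04T14:30:00Z"))  # 10:30 ET, intraday   -> 2024-06-04
print(resolve_trade_date("2024-06-04T21:30:00Z"))  # 17:30 ET, post-close -> 2024-06-05
print(resolve_trade_date("2024-06-08T12:00:00Z"))  # Saturday             -> 2024-06-10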

292
backend/data/polling_price_manager.py Normal file

@@ -0,0 +1,292 @@
# -*- coding: utf-8 -*-
"""
Polling-based Price Manager with provider-aware quote polling.
Supports Finnhub and yfinance for near real-time price fetching.
"""
import logging
import threading
import time
from typing import Callable, Dict, List, Optional
import finnhub
import yfinance as yf
from backend.data.provider_utils import normalize_symbol
logger = logging.getLogger(__name__)
_SUPPRESSED_LOG_EVERY = 20
class PollingPriceManager:
"""Polling-based price manager using Finnhub or yfinance."""
def __init__(
self,
api_key: Optional[str] = None,
poll_interval: int = 30,
provider: str = "finnhub",
):
"""
Args:
api_key: Finnhub API Key
poll_interval: Polling interval in seconds (default 30s)
provider: Quote provider (`finnhub` or `yfinance`)
"""
self.api_key = api_key
self.poll_interval = poll_interval
self.provider = provider
self.finnhub_client = (
finnhub.Client(api_key=api_key)
if provider == "finnhub" and api_key
else None
)
self.subscribed_symbols: List[str] = []
self.latest_prices: Dict[str, float] = {}
self.open_prices: Dict[str, float] = {}
self.price_callbacks: List[Callable] = []
self._failure_counts: Dict[str, int] = {}
self.running = False
self._thread: Optional[threading.Thread] = None
logger.info(
"PollingPriceManager initialized "
f"(provider: {provider}, interval: {poll_interval}s)",
)
def subscribe(self, symbols: List[str]):
"""Subscribe to stock symbols"""
for symbol in symbols:
symbol = normalize_symbol(symbol)
if symbol not in self.subscribed_symbols:
self.subscribed_symbols.append(symbol)
logger.info(f"Subscribed to: {symbol}")
def unsubscribe(self, symbols: List[str]):
"""Unsubscribe from symbols"""
for symbol in symbols:
symbol = normalize_symbol(symbol)
if symbol in self.subscribed_symbols:
self.subscribed_symbols.remove(symbol)
logger.info(f"Unsubscribed: {symbol}")
def add_price_callback(self, callback: Callable):
"""Add price update callback"""
self.price_callbacks.append(callback)
def _fetch_prices(self):
"""Fetch latest prices for all subscribed stocks"""
for symbol in self.subscribed_symbols:
try:
quote_data = self._fetch_quote(symbol)
if not isinstance(quote_data, dict):
raise ValueError(f"{symbol}: Empty quote payload")
current_price = quote_data.get("c")
open_price = quote_data.get("o")
timestamp = quote_data.get("t", int(time.time()))
if not current_price or current_price <= 0:
logger.warning(f"{symbol}: Invalid price data")
continue
# Store open price on first fetch
if (
symbol not in self.open_prices
and open_price
and open_price > 0
):
self.open_prices[symbol] = open_price
logger.info(f"{symbol} open price: ${open_price:.2f}")
                stored_open = self.open_prices.get(symbol, open_price)
                ret = (
                    ((current_price - stored_open) / stored_open) * 100
                    if stored_open and stored_open > 0
                    else 0
                )
self.latest_prices[symbol] = current_price
previous_failures = self._failure_counts.pop(symbol, 0)
if previous_failures > 0:
logger.info(
"%s quote polling recovered after %d consecutive failures",
symbol,
previous_failures,
)
price_data = {
"symbol": symbol,
"price": current_price,
"timestamp": timestamp * 1000,
"open": stored_open,
"high": quote_data.get("h"),
"low": quote_data.get("l"),
"previous_close": quote_data.get("pc"),
"ret": ret,
"change": quote_data.get("d"),
"change_percent": quote_data.get("dp"),
}
for callback in self.price_callbacks:
try:
callback(price_data)
except Exception as e:
logger.error(f"Price callback error ({symbol}): {e}")
logger.debug(
f"{symbol}: ${current_price:.2f} [ret: {ret:+.2f}%]",
)
except Exception as e:
failure_count = self._failure_counts.get(symbol, 0) + 1
self._failure_counts[symbol] = failure_count
message = f"Failed to fetch {symbol} price: {e}"
if failure_count == 1:
logger.warning(message)
elif failure_count % _SUPPRESSED_LOG_EVERY == 0:
logger.warning(
"%s (repeated %d times; suppressing intermediate failures)",
message,
failure_count,
)
else:
logger.debug(message)
def _fetch_quote(self, symbol: str) -> Dict[str, float]:
"""Fetch a normalized quote payload from the configured provider."""
if self.provider == "yfinance":
return self._fetch_yfinance_quote(symbol)
if not self.finnhub_client:
raise ValueError("Finnhub API key required for finnhub polling")
quote = self.finnhub_client.quote(symbol)
if not isinstance(quote, dict):
raise ValueError(f"{symbol}: Invalid Finnhub quote payload")
return quote
def _fetch_yfinance_quote(self, symbol: str) -> Dict[str, float]:
"""Fetch quote data from yfinance and normalize to Finnhub-like keys."""
ticker = yf.Ticker(symbol)
fast_info = dict(getattr(ticker, "fast_info", {}) or {})
current_price = _coerce_float(
fast_info.get("lastPrice") or fast_info.get("regularMarketPrice"),
)
open_price = _coerce_float(
fast_info.get("open") or fast_info.get("regularMarketOpen"),
)
previous_close = _coerce_float(
fast_info.get("previousClose")
or fast_info.get("regularMarketPreviousClose"),
)
high_price = _coerce_float(
fast_info.get("dayHigh") or fast_info.get("regularMarketDayHigh"),
)
low_price = _coerce_float(
fast_info.get("dayLow") or fast_info.get("regularMarketDayLow"),
)
if current_price is None:
history = ticker.history(period="1d", interval="1m", auto_adjust=False)
if history is None:
raise ValueError(f"{symbol}: yfinance returned no history frame")
if history.empty:
raise ValueError(f"{symbol}: No yfinance quote data")
latest = history.iloc[-1]
current_price = _coerce_float(latest.get("Close"))
open_price = open_price or _coerce_float(history.iloc[0].get("Open"))
high_price = high_price or _coerce_float(history["High"].max())
low_price = low_price or _coerce_float(history["Low"].min())
if current_price is None:
raise ValueError(f"{symbol}: Invalid yfinance quote data")
effective_open = open_price or previous_close or current_price
effective_prev_close = previous_close or effective_open or current_price
change = current_price - effective_prev_close
change_percent = (
(change / effective_prev_close) * 100 if effective_prev_close else 0.0
)
return {
"c": current_price,
"o": effective_open,
"h": high_price or max(current_price, effective_open),
"l": low_price or min(current_price, effective_open),
"pc": effective_prev_close,
"d": change,
"dp": change_percent,
"t": int(time.time()),
}
def _polling_loop(self):
"""Main polling loop"""
logger.info(f"Price polling started (interval: {self.poll_interval}s)")
while self.running:
try:
start_time = time.time()
self._fetch_prices()
elapsed = time.time() - start_time
sleep_time = max(0, self.poll_interval - elapsed)
if sleep_time > 0:
time.sleep(sleep_time)
except Exception as e:
logger.error(f"Polling loop error: {e}")
time.sleep(5)
def start(self):
"""Start price polling"""
if self.running:
logger.warning("Price polling already running")
return
if not self.subscribed_symbols:
logger.warning("No stocks subscribed")
return
self.running = True
self._thread = threading.Thread(target=self._polling_loop, daemon=True)
self._thread.start()
logger.info(
f"Price polling started: {', '.join(self.subscribed_symbols)}",
)
def stop(self):
"""Stop price polling"""
self.running = False
if self._thread:
self._thread.join(timeout=5)
logger.info("Price polling stopped")
def get_latest_price(self, symbol: str) -> Optional[float]:
"""Get latest price for symbol"""
return self.latest_prices.get(symbol)
def get_all_latest_prices(self) -> Dict[str, float]:
"""Get all latest prices"""
return self.latest_prices.copy()
def get_open_price(self, symbol: str) -> Optional[float]:
"""Get open price for symbol"""
return self.open_prices.get(symbol)
def reset_open_prices(self):
"""Reset open prices for new trading day"""
self.open_prices.clear()
logger.info("Open prices reset")
def _coerce_float(value) -> Optional[float]:
try:
if value is None:
return None
return float(value)
except (TypeError, ValueError):
return None
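
A usage sketch with the keyless yfinance provider (a Finnhub api_key would be needed for provider="finnhub"):

import time

from backend.data import PollingPriceManager

manager = PollingPriceManager(provider="yfinance", poll_interval=15)
manager.subscribe(["AAPL"])
manager.add_price_callback(
    lambda tick: print(f"{tick['symbol']}: {tick['price']} ({tick['ret']:+.2f}%)")
)
manager.start()  # polls on a daemon thread
time.sleep(60)   # let a few polls complete
manager.stop()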

161
backend/data/polygon_client.py Normal file

@@ -0,0 +1,161 @@
# -*- coding: utf-8 -*-
"""Polygon client used for long-lived market research ingestion."""
from __future__ import annotations
import os
import time
from datetime import datetime, timezone
from typing import Any, Optional
import requests
BASE = "https://api.polygon.io"
def _headers() -> dict[str, str]:
api_key = os.getenv("POLYGON_API_KEY", "").strip()
if not api_key:
raise ValueError("Missing required API key: POLYGON_API_KEY")
return {"Authorization": f"Bearer {api_key}"}
def http_get(
url: str,
params: Optional[dict[str, Any]] = None,
*,
max_retries: int = 8,
backoff: float = 2.0,
) -> requests.Response:
"""HTTP GET with exponential backoff and 429 handling."""
for attempt in range(max_retries):
try:
response = requests.get(
url,
params=params or {},
headers=_headers(),
timeout=30,
)
        except requests.RequestException:
            if attempt == max_retries - 1:
                raise
            time.sleep((backoff**attempt) + 0.5)
            continue
        if response.status_code == 429:
            if attempt == max_retries - 1:
                response.raise_for_status()
            retry_after = response.headers.get("Retry-After")
            wait = (
                float(retry_after)
                if retry_after and retry_after.isdigit()
                else min((backoff**attempt) + 1.0, 60.0)
            )
            time.sleep(wait)
            continue
        if 500 <= response.status_code < 600:
            if attempt == max_retries - 1:
                response.raise_for_status()
            time.sleep(min((backoff**attempt) + 1.0, 60.0))
            continue
response.raise_for_status()
return response
raise RuntimeError("Unreachable")
def fetch_ticker_details(symbol: str) -> dict[str, Any]:
"""Fetch company metadata from Polygon."""
response = http_get(f"{BASE}/v3/reference/tickers/{symbol}")
return response.json().get("results", {}) or {}
def fetch_ohlc(symbol: str, start_date: str, end_date: str) -> list[dict[str, Any]]:
"""Fetch daily OHLC data from Polygon."""
response = http_get(
f"{BASE}/v2/aggs/ticker/{symbol}/range/1/day/{start_date}/{end_date}",
params={"adjusted": "true", "sort": "asc", "limit": 50000},
)
results = response.json().get("results") or []
rows: list[dict[str, Any]] = []
for item in results:
rows.append(
{
"date": datetime.fromtimestamp(
int(item["t"]) / 1000,
tz=timezone.utc,
).date().isoformat(),
"open": item.get("o"),
"high": item.get("h"),
"low": item.get("l"),
"close": item.get("c"),
"volume": item.get("v"),
"vwap": item.get("vw"),
"transactions": item.get("n"),
}
)
return rows
def fetch_news(
symbol: str,
start_date: str,
end_date: str,
*,
per_page: int = 50,
page_sleep: float = 1.2,
max_pages: Optional[int] = None,
) -> list[dict[str, Any]]:
"""Fetch all Polygon news for a ticker, with pagination."""
url = f"{BASE}/v2/reference/news"
params = {
"ticker": symbol,
"published_utc.gte": start_date,
"published_utc.lte": end_date,
"limit": per_page,
"order": "asc",
}
next_url: Optional[str] = None
pages = 0
all_articles: list[dict[str, Any]] = []
seen_ids: set[str] = set()
while True:
response = http_get(next_url or url, params=None if next_url else params)
data = response.json()
results = data.get("results") or []
if not results:
break
for item in results:
article_id = item.get("id")
if article_id and article_id in seen_ids:
continue
all_articles.append(
{
"id": article_id,
"publisher": (item.get("publisher") or {}).get("name"),
"title": item.get("title"),
"author": item.get("author"),
"published_utc": item.get("published_utc"),
"amp_url": item.get("amp_url"),
"article_url": item.get("article_url"),
"tickers": item.get("tickers"),
"description": item.get("description"),
"insights": item.get("insights"),
}
)
if article_id:
seen_ids.add(article_id)
next_url = data.get("next_url")
pages += 1
if max_pages is not None and pages >= max_pages:
break
if not next_url:
break
time.sleep(page_sleep)
return all_articles
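
A quick smoke test for the client, assuming POLYGON_API_KEY is exported:

from backend.data.polygon_client import fetch_news, fetch_ohlc, fetch_ticker_details

print(fetch_ticker_details("AAPL").get("name"))

bars = fetch_ohlc("AAPL", "2024-06-03", "2024-06-07")
if bars:
    print(len(bars), bars[0]["date"], bars[0]["close"])

articles = fetch_news("AAPL", "2024-06-03", "2024-06-07", max_pages=2)
if articles:
    print(len(articles), articles[0]["title"])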

910
backend/data/provider_router.py Normal file

@@ -0,0 +1,910 @@
# -*- coding: utf-8 -*-
"""Unified data provider router with fallback support."""
import datetime
import logging
from pathlib import Path
from typing import Callable, Optional
import finnhub
import pandas as pd
import yfinance as yf
from backend.config.data_config import DataSource, get_data_sources
from shared.schema import (
CompanyFactsResponse,
CompanyNews,
CompanyNewsResponse,
FinancialMetrics,
FinancialMetricsResponse,
InsiderTrade,
InsiderTradeResponse,
LineItem,
LineItemResponse,
Price,
PriceResponse,
)
logger = logging.getLogger(__name__)
_DATA_DIR = Path(__file__).parent / "ret_data"
def _format_provider_error(exc: Exception) -> str:
"""Condense common provider failures into short, readable messages."""
message = str(exc).strip().replace("\n", " ")
if "429" in message:
return "rate limit reached"
if "402" in message:
return "insufficient credits"
if "422" in message or "Missing parameters" in message:
return "invalid request parameters"
if "Quote not found" in message:
return "quote not found"
return message
def _has_valid_ticker(ticker: str) -> bool:
"""Return whether the normalized ticker is non-empty."""
return bool((ticker or "").strip())
class DataProviderRouter:
"""Route data requests across configured providers with fallbacks."""
def __init__(self):
self.sources = get_data_sources()
self._usage = {
"preferred": list(self.sources),
"last_success": {},
}
self._listeners: list[Callable[[dict], None]] = []
def price_sources(self) -> list[DataSource]:
"""Price lookup order, always allowing local CSV fallback."""
return self.sources
def api_sources(self) -> list[DataSource]:
"""Providers that can serve network-backed data."""
return [source for source in self.sources if source != "local_csv"]
def get_prices(
self,
ticker: str,
start_date: str,
end_date: str,
) -> tuple[list[Price], DataSource]:
"""Fetch prices using preferred providers with fallback."""
if not _has_valid_ticker(ticker):
return [], "local_csv"
last_error: Optional[Exception] = None
for source in self.price_sources():
try:
if source == "finnhub":
prices = _fetch_finnhub_prices(ticker, start_date, end_date)
self._record_success("prices", source)
return prices, source
if source == "financial_datasets":
prices = _fetch_fd_prices(ticker, start_date, end_date)
self._record_success("prices", source)
return prices, source
if source == "yfinance":
prices = _fetch_yfinance_prices(ticker, start_date, end_date)
self._record_success("prices", source)
return prices, source
prices = _fetch_local_prices(ticker, start_date, end_date)
if prices:
self._record_success("prices", source)
return prices, source
except Exception as exc:
last_error = exc
logger.warning(
"Price source %s failed for %s: %s",
source,
ticker,
_format_provider_error(exc),
)
if last_error:
raise last_error
return [], "local_csv"
def get_financial_metrics(
self,
ticker: str,
end_date: str,
period: str = "ttm",
limit: int = 10,
) -> tuple[list[FinancialMetrics], DataSource]:
"""Fetch financial metrics with API provider fallback."""
if not _has_valid_ticker(ticker):
return [], "local_csv"
last_error: Optional[Exception] = None
for source in self.api_sources():
try:
if source == "finnhub":
metrics = _fetch_finnhub_financial_metrics(
ticker,
end_date,
period,
)
self._record_success("financial_metrics", source)
return metrics, source
if source == "yfinance":
metrics = _fetch_yfinance_financial_metrics(
ticker,
end_date,
period,
)
self._record_success("financial_metrics", source)
return metrics, source
metrics = _fetch_fd_financial_metrics(
ticker,
end_date,
period,
limit,
)
self._record_success("financial_metrics", source)
return metrics, source
except Exception as exc:
last_error = exc
logger.warning(
"Financial metrics source %s failed for %s: %s",
source,
ticker,
_format_provider_error(exc),
)
if last_error:
raise last_error
return [], "local_csv"
def search_line_items(
self,
ticker: str,
line_items: list[str],
end_date: str,
period: str = "ttm",
limit: int = 10,
) -> list[LineItem]:
"""Line items are only supported via Financial Datasets."""
if not _has_valid_ticker(ticker):
return []
if "financial_datasets" not in self.api_sources():
return []
try:
results = _fetch_fd_line_items(
ticker=ticker,
line_items=line_items,
end_date=end_date,
period=period,
limit=limit,
)
self._record_success("line_items", "financial_datasets")
return results
except Exception as exc:
logger.warning(
"Line items source failed for %s: %s",
ticker,
_format_provider_error(exc),
)
return []
def get_insider_trades(
self,
ticker: str,
end_date: str,
start_date: Optional[str] = None,
limit: int = 1000,
) -> tuple[list[InsiderTrade], DataSource]:
"""Fetch insider trades with provider fallback."""
if not _has_valid_ticker(ticker):
return [], "local_csv"
last_error: Optional[Exception] = None
for source in self.api_sources():
try:
if source == "finnhub":
trades = _fetch_finnhub_insider_trades(
ticker,
start_date,
end_date,
limit,
)
self._record_success("insider_trades", source)
return trades, source
trades = _fetch_fd_insider_trades(
ticker,
start_date,
end_date,
limit,
)
self._record_success("insider_trades", source)
return trades, source
except Exception as exc:
last_error = exc
logger.warning(
"Insider trades source %s failed for %s: %s",
source,
ticker,
_format_provider_error(exc),
)
if last_error:
raise last_error
return [], "local_csv"
def get_company_news(
self,
ticker: str,
end_date: str,
start_date: Optional[str] = None,
limit: int = 1000,
) -> tuple[list[CompanyNews], DataSource]:
"""Fetch company news with provider fallback."""
if not _has_valid_ticker(ticker):
return [], "local_csv"
last_error: Optional[Exception] = None
for source in self.api_sources():
try:
if source == "finnhub":
news = _fetch_finnhub_company_news(
ticker,
start_date,
end_date,
limit,
)
self._record_success("company_news", source)
return news, source
if source == "yfinance":
news = _fetch_yfinance_company_news(
ticker,
start_date,
end_date,
limit,
)
self._record_success("company_news", source)
return news, source
news = _fetch_fd_company_news(
ticker,
start_date,
end_date,
limit,
)
self._record_success("company_news", source)
return news, source
except Exception as exc:
last_error = exc
logger.warning(
"Company news source %s failed for %s: %s",
source,
ticker,
_format_provider_error(exc),
)
if last_error:
raise last_error
return [], "local_csv"
def get_market_cap(
self,
ticker: str,
end_date: str,
metrics_lookup,
) -> tuple[Optional[float], DataSource]:
"""Fetch market cap using facts API or financial metrics fallback."""
if not _has_valid_ticker(ticker):
return None, "local_csv"
today = datetime.datetime.now().strftime("%Y-%m-%d")
if end_date == today and "financial_datasets" in self.api_sources():
            try:
                market_cap = _fetch_fd_market_cap_today(ticker)
                self._record_success("market_cap", "financial_datasets")
                return market_cap, "financial_datasets"
except Exception as exc:
logger.warning(
"Market cap facts source failed for %s: %s",
ticker,
_format_provider_error(exc),
)
metrics, source = metrics_lookup(ticker, end_date)
if not metrics:
return None, source
market_cap = metrics[0].market_cap
if market_cap is None:
return None, source
if source == "finnhub":
self._record_success("market_cap", source)
return market_cap * 1_000_000, source
self._record_success("market_cap", source)
return market_cap, source
def get_usage_snapshot(self) -> dict:
"""Return provider usage metadata for UI/debugging."""
return {
"preferred": list(self._usage["preferred"]),
"last_success": dict(self._usage["last_success"]),
}
def add_listener(self, listener: Callable[[dict], None]) -> None:
"""Register a callback for provider usage changes."""
if listener not in self._listeners:
self._listeners.append(listener)
def remove_listener(self, listener: Callable[[dict], None]) -> None:
"""Remove a previously registered listener."""
if listener in self._listeners:
self._listeners.remove(listener)
def load_local_price_frame(
self,
ticker: str,
start_date: Optional[str] = None,
end_date: Optional[str] = None,
) -> pd.DataFrame:
"""Load local CSV prices as a DataFrame for backtest managers."""
csv_path = _DATA_DIR / f"{ticker}.csv"
if not csv_path.exists():
return pd.DataFrame()
df = pd.read_csv(csv_path)
if df.empty or "time" not in df.columns:
return pd.DataFrame()
df["time"] = pd.to_datetime(df["time"])
if start_date:
df = df[df["time"] >= pd.to_datetime(start_date)]
if end_date:
df = df[df["time"] <= pd.to_datetime(end_date)]
if df.empty:
return pd.DataFrame()
df["Date"] = pd.to_datetime(df["time"])
df.set_index("Date", inplace=True)
df.sort_index(inplace=True)
self._record_success("historical_prices", "local_csv")
return df
def _record_success(self, data_type: str, source: DataSource) -> None:
previous = self._usage["last_success"].get(data_type)
self._usage["last_success"][data_type] = source
if previous != source:
snapshot = self.get_usage_snapshot()
for listener in list(self._listeners):
try:
listener(snapshot)
except Exception as exc:
logger.warning("Provider listener failed: %s", exc)
_router_instance: Optional[DataProviderRouter] = None
def get_provider_router() -> DataProviderRouter:
"""Return a shared provider router instance."""
global _router_instance
if _router_instance is None:
_router_instance = DataProviderRouter()
return _router_instance
def _get_finnhub_client() -> finnhub.Client:
api_key = _env_required("FINNHUB_API_KEY")
return finnhub.Client(api_key=api_key)
def _env_required(key: str) -> str:
import os
value = os.getenv(key, "").strip()
if not value:
raise ValueError(f"Missing required API key: {key}")
return value
def _make_api_request(url: str, headers: dict, method: str = "GET", json_data: Optional[dict] = None):
import requests
response = (
requests.post(url, headers=headers, json=json_data)
if method.upper() == "POST"
else requests.get(url, headers=headers)
)
if response.status_code != 200:
raise ValueError(f"{response.status_code} - {response.text}")
return response
def _fetch_local_prices(
ticker: str,
start_date: str,
end_date: str,
) -> list[Price]:
csv_path = _DATA_DIR / f"{ticker}.csv"
if not csv_path.exists():
return []
df = pd.read_csv(csv_path)
if df.empty or "time" not in df.columns:
return []
df["time"] = pd.to_datetime(df["time"])
start = pd.to_datetime(start_date)
end = pd.to_datetime(end_date)
df = df[(df["time"] >= start) & (df["time"] <= end)].copy()
if df.empty:
return []
return [
Price(
open=float(row["open"]),
close=float(row["close"]),
high=float(row["high"]),
low=float(row["low"]),
volume=int(float(row["volume"])),
time=row["time"].strftime("%Y-%m-%d"),
)
for _, row in df.iterrows()
]
def _fetch_finnhub_prices(
ticker: str,
start_date: str,
end_date: str,
) -> list[Price]:
client = _get_finnhub_client()
start_timestamp = int(
datetime.datetime.strptime(start_date, "%Y-%m-%d").timestamp(),
)
end_timestamp = int(
(
datetime.datetime.strptime(end_date, "%Y-%m-%d")
+ datetime.timedelta(days=1)
).timestamp(),
)
candles = client.stock_candles(ticker, "D", start_timestamp, end_timestamp)
return [
Price(
open=candles["o"][i],
close=candles["c"][i],
high=candles["h"][i],
low=candles["l"][i],
volume=int(candles["v"][i]),
time=datetime.datetime.fromtimestamp(candles["t"][i]).strftime(
"%Y-%m-%d",
),
)
for i in range(len(candles.get("t", [])))
]
def _fetch_yfinance_prices(
ticker: str,
start_date: str,
end_date: str,
) -> list[Price]:
history = yf.Ticker(ticker).history(
start=start_date,
end=(
datetime.datetime.strptime(end_date, "%Y-%m-%d")
+ datetime.timedelta(days=1)
).strftime("%Y-%m-%d"),
auto_adjust=False,
actions=False,
)
if history.empty:
return []
history = history.reset_index()
date_column = "Date" if "Date" in history.columns else history.columns[0]
return [
Price(
open=float(row["Open"]),
close=float(row["Close"]),
high=float(row["High"]),
low=float(row["Low"]),
volume=int(float(row["Volume"])),
time=pd.to_datetime(row[date_column]).strftime("%Y-%m-%d"),
)
for _, row in history.iterrows()
]
def _fetch_fd_prices(
ticker: str,
start_date: str,
end_date: str,
) -> list[Price]:
headers = {"X-API-KEY": _env_required("FINANCIAL_DATASETS_API_KEY")}
url = (
"https://api.financialdatasets.ai/prices/"
f"?ticker={ticker}&interval=day&interval_multiplier=1"
f"&start_date={start_date}&end_date={end_date}"
)
response = _make_api_request(url, headers)
return PriceResponse(**response.json()).prices
def _fetch_finnhub_financial_metrics(
ticker: str,
end_date: str,
period: str,
) -> list[FinancialMetrics]:
client = _get_finnhub_client()
financials = client.company_basic_financials(ticker, "all")
metric_data = financials.get("metric", {})
if not metric_data:
return []
return [_map_finnhub_metrics(ticker, end_date, period, metric_data)]
def _fetch_fd_financial_metrics(
ticker: str,
end_date: str,
period: str,
limit: int,
) -> list[FinancialMetrics]:
headers = {"X-API-KEY": _env_required("FINANCIAL_DATASETS_API_KEY")}
url = (
"https://api.financialdatasets.ai/financial-metrics/"
f"?ticker={ticker}&report_period_lte={end_date}&limit={limit}&period={period}"
)
response = _make_api_request(url, headers)
return FinancialMetricsResponse(**response.json()).financial_metrics
def _fetch_yfinance_financial_metrics(
ticker: str,
end_date: str,
period: str,
) -> list[FinancialMetrics]:
info = yf.Ticker(ticker).info or {}
shares_outstanding = _coerce_float(info.get("sharesOutstanding"))
free_cashflow = _coerce_float(info.get("freeCashflow"))
return [
FinancialMetrics(
ticker=ticker,
report_period=end_date,
period=period,
currency=str(info.get("currency") or "USD"),
market_cap=_coerce_float(info.get("marketCap")),
enterprise_value=_coerce_float(info.get("enterpriseValue")),
price_to_earnings_ratio=_coerce_float(info.get("trailingPE")),
price_to_book_ratio=_coerce_float(info.get("priceToBook")),
price_to_sales_ratio=_coerce_float(
info.get("priceToSalesTrailing12Months"),
),
enterprise_value_to_ebitda_ratio=_coerce_float(
info.get("enterpriseToEbitda"),
),
enterprise_value_to_revenue_ratio=_coerce_float(
info.get("enterpriseToRevenue"),
),
free_cash_flow_yield=_ratio_or_none(free_cashflow, info.get("marketCap")),
peg_ratio=_coerce_float(info.get("pegRatio")),
gross_margin=_coerce_float(info.get("grossMargins")),
operating_margin=_coerce_float(info.get("operatingMargins")),
net_margin=_coerce_float(info.get("profitMargins")),
return_on_equity=_coerce_float(info.get("returnOnEquity")),
return_on_assets=_coerce_float(info.get("returnOnAssets")),
return_on_invested_capital=None,
asset_turnover=None,
inventory_turnover=None,
receivables_turnover=None,
days_sales_outstanding=None,
operating_cycle=None,
working_capital_turnover=None,
current_ratio=_coerce_float(info.get("currentRatio")),
quick_ratio=_coerce_float(info.get("quickRatio")),
cash_ratio=None,
operating_cash_flow_ratio=None,
debt_to_equity=_coerce_float(info.get("debtToEquity")),
debt_to_assets=None,
interest_coverage=None,
revenue_growth=_coerce_float(info.get("revenueGrowth")),
earnings_growth=_coerce_float(
info.get("earningsGrowth") or info.get("earningsQuarterlyGrowth"),
),
book_value_growth=None,
earnings_per_share_growth=_coerce_float(
info.get("earningsQuarterlyGrowth"),
),
free_cash_flow_growth=None,
operating_income_growth=None,
ebitda_growth=None,
payout_ratio=_coerce_float(info.get("payoutRatio")),
earnings_per_share=_coerce_float(info.get("trailingEps")),
book_value_per_share=_coerce_float(info.get("bookValue")),
free_cash_flow_per_share=_ratio_or_none(free_cashflow, shares_outstanding),
),
]
def _fetch_fd_line_items(
ticker: str,
line_items: list[str],
end_date: str,
period: str,
limit: int,
) -> list[LineItem]:
headers = {"X-API-KEY": _env_required("FINANCIAL_DATASETS_API_KEY")}
body = {
"tickers": [ticker],
"line_items": line_items,
"end_date": end_date,
"period": period,
"limit": limit,
}
response = _make_api_request(
"https://api.financialdatasets.ai/financials/search/line-items",
headers,
method="POST",
json_data=body,
)
return LineItemResponse(**response.json()).search_results[:limit]
def _fetch_finnhub_insider_trades(
ticker: str,
start_date: Optional[str],
end_date: str,
limit: int,
) -> list[InsiderTrade]:
client = _get_finnhub_client()
from_date = start_date or (
datetime.datetime.strptime(end_date, "%Y-%m-%d")
- datetime.timedelta(days=365)
).strftime("%Y-%m-%d")
insider_data = client.stock_insider_transactions(ticker, from_date, end_date)
return [
_convert_finnhub_insider_trade(ticker, trade)
for trade in insider_data.get("data", [])[:limit]
]
def _fetch_yfinance_company_news(
ticker: str,
start_date: Optional[str],
end_date: str,
limit: int,
) -> list[CompanyNews]:
news_items = getattr(yf.Ticker(ticker), "news", None) or []
start_bound = _normalize_timestamp(pd.to_datetime(start_date)) if start_date else None
end_bound = _normalize_timestamp(pd.to_datetime(end_date))
results: list[CompanyNews] = []
for item in news_items:
content = item.get("content", item)
published = (
content.get("pubDate")
or content.get("displayTime")
or item.get("providerPublishTime")
)
published_dt = _normalize_timestamp(_parse_news_datetime(published))
if published_dt is not None and published_dt > end_bound:
continue
if start_bound is not None and published_dt is not None and published_dt < start_bound:
continue
url = (
_nested_get(content, "canonicalUrl", "url")
or content.get("clickThroughUrl")
or content.get("url")
or item.get("link")
)
title = content.get("title") or item.get("title")
if not title or not url:
continue
results.append(
CompanyNews(
category=content.get("contentType") or item.get("type"),
ticker=ticker,
title=title,
related=item.get("relatedTickers", [ticker])[0]
if item.get("relatedTickers")
else ticker,
source=_nested_get(content, "provider", "displayName")
or item.get("publisher")
or "Yahoo Finance",
date=published_dt.strftime("%Y-%m-%d") if published_dt else None,
url=url,
summary=content.get("summary") or item.get("summary"),
),
)
if len(results) >= limit:
break
return results
def _map_finnhub_metrics(
ticker: str,
end_date: str,
period: str,
metric_data: dict,
) -> FinancialMetrics:
"""Map Finnhub metric data to FinancialMetrics model."""
return FinancialMetrics(
ticker=ticker,
report_period=end_date,
period=period,
currency="USD",
market_cap=metric_data.get("marketCapitalization"),
enterprise_value=None,
price_to_earnings_ratio=metric_data.get("peBasicExclExtraTTM"),
price_to_book_ratio=metric_data.get("pbAnnual"),
price_to_sales_ratio=metric_data.get("psAnnual"),
enterprise_value_to_ebitda_ratio=None,
enterprise_value_to_revenue_ratio=None,
free_cash_flow_yield=None,
peg_ratio=None,
gross_margin=metric_data.get("grossMarginTTM"),
operating_margin=metric_data.get("operatingMarginTTM"),
net_margin=metric_data.get("netProfitMarginTTM"),
return_on_equity=metric_data.get("roeTTM"),
return_on_assets=metric_data.get("roaTTM"),
return_on_invested_capital=metric_data.get("roicTTM"),
asset_turnover=metric_data.get("assetTurnoverTTM"),
inventory_turnover=metric_data.get("inventoryTurnoverTTM"),
receivables_turnover=metric_data.get("receivablesTurnoverTTM"),
days_sales_outstanding=None,
operating_cycle=None,
working_capital_turnover=None,
current_ratio=metric_data.get("currentRatioAnnual"),
quick_ratio=metric_data.get("quickRatioAnnual"),
cash_ratio=None,
operating_cash_flow_ratio=None,
debt_to_equity=metric_data.get("totalDebt/totalEquityAnnual"),
debt_to_assets=None,
interest_coverage=None,
revenue_growth=metric_data.get("revenueGrowthTTMYoy"),
earnings_growth=None,
book_value_growth=None,
earnings_per_share_growth=metric_data.get("epsGrowthTTMYoy"),
free_cash_flow_growth=None,
operating_income_growth=None,
ebitda_growth=None,
payout_ratio=metric_data.get("payoutRatioAnnual"),
earnings_per_share=metric_data.get("epsBasicExclExtraItemsTTM"),
book_value_per_share=metric_data.get("bookValuePerShareAnnual"),
free_cash_flow_per_share=None,
)
def _coerce_float(value) -> Optional[float]:
try:
if value is None:
return None
return float(value)
except (TypeError, ValueError):
return None
def _ratio_or_none(numerator, denominator) -> Optional[float]:
top = _coerce_float(numerator)
bottom = _coerce_float(denominator)
if top is None or bottom in (None, 0.0):
return None
return top / bottom
def _nested_get(payload: dict, *keys: str):
current = payload
for key in keys:
if not isinstance(current, dict):
return None
current = current.get(key)
return current
def _parse_news_datetime(value) -> Optional[pd.Timestamp]:
if value is None:
return None
try:
if isinstance(value, (int, float)):
return pd.to_datetime(int(value), unit="s")
return pd.to_datetime(value)
except (TypeError, ValueError):
return None
def _normalize_timestamp(value: Optional[pd.Timestamp]) -> Optional[pd.Timestamp]:
if value is None:
return None
if value.tzinfo is not None:
return value.tz_convert(None)
return value
def _convert_finnhub_insider_trade(ticker: str, trade: dict) -> InsiderTrade:
"""Convert Finnhub insider trade format to InsiderTrade model."""
shares_after = trade.get("share", 0)
change = trade.get("change", 0)
return InsiderTrade(
ticker=ticker,
issuer=None,
name=trade.get("name", ""),
title=None,
is_board_director=None,
transaction_date=trade.get("transactionDate", ""),
transaction_shares=abs(change),
transaction_price_per_share=trade.get("transactionPrice", 0.0),
transaction_value=abs(change) * trade.get("transactionPrice", 0.0),
shares_owned_before_transaction=(
shares_after - change if shares_after and change else None
),
shares_owned_after_transaction=float(shares_after)
if shares_after
else None,
security_title=None,
filing_date=trade.get("filingDate", ""),
)
def _fetch_fd_insider_trades(
ticker: str,
start_date: Optional[str],
end_date: str,
limit: int,
) -> list[InsiderTrade]:
headers = {"X-API-KEY": _env_required("FINANCIAL_DATASETS_API_KEY")}
url = f"https://api.financialdatasets.ai/insider-trades/?ticker={ticker}&filing_date_lte={end_date}"
if start_date:
url += f"&filing_date_gte={start_date}"
url += f"&limit={limit}"
response = _make_api_request(url, headers)
return InsiderTradeResponse(**response.json()).insider_trades
def _fetch_finnhub_company_news(
ticker: str,
start_date: Optional[str],
end_date: str,
limit: int,
) -> list[CompanyNews]:
client = _get_finnhub_client()
from_date = start_date or (
datetime.datetime.strptime(end_date, "%Y-%m-%d")
- datetime.timedelta(days=30)
).strftime("%Y-%m-%d")
news_data = client.company_news(ticker, _from=from_date, to=end_date)
return [
CompanyNews(
ticker=ticker,
title=news_item.get("headline", ""),
related=news_item.get("related", ""),
source=news_item.get("source", ""),
date=(
datetime.datetime.fromtimestamp(
news_item.get("datetime", 0),
datetime.timezone.utc,
).strftime("%Y-%m-%d")
if news_item.get("datetime")
else None
),
url=news_item.get("url", ""),
summary=news_item.get("summary", ""),
category=news_item.get("category", ""),
)
for news_item in news_data[:limit]
]
def _fetch_fd_company_news(
ticker: str,
start_date: Optional[str],
end_date: str,
limit: int,
) -> list[CompanyNews]:
headers = {"X-API-KEY": _env_required("FINANCIAL_DATASETS_API_KEY")}
url = f"https://api.financialdatasets.ai/news/?ticker={ticker}&end_date={end_date}&limit={limit}"
if start_date:
url += f"&start_date={start_date}"
response = _make_api_request(url, headers)
return CompanyNewsResponse(**response.json()).news
def _fetch_fd_market_cap_today(ticker: str) -> Optional[float]:
headers = {"X-API-KEY": _env_required("FINANCIAL_DATASETS_API_KEY")}
url = f"https://api.financialdatasets.ai/company/facts/?ticker={ticker}"
response = _make_api_request(url, headers)
return CompanyFactsResponse(**response.json()).company_facts.market_cap
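
A sketch of routed fetching with fallback, assuming at least one provider key is configured in the environment:

from backend.data.provider_router import get_provider_router

router = get_provider_router()
router.add_listener(lambda snapshot: print("provider usage:", snapshot["last_success"]))

prices, source = router.get_prices("AAPL", "2024-06-03", "2024-06-07")
print(f"{len(prices)} bars via {source}")

news, source = router.get_company_news("AAPL", end_date="2024-06-07", limit=5)
for item in news:
    print(item.date, item.title)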

67
backend/data/provider_utils.py Normal file

@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
"""Shared market symbol normalization helpers."""
from dataclasses import dataclass
@dataclass(frozen=True)
class MarketSymbol:
"""Normalized symbol metadata."""
raw: str
canonical: str
market: str
def canonical_symbol(symbol: str) -> str:
"""Return canonical uppercase symbol for storage and routing."""
return (symbol or "").strip().upper()
def normalize_symbol(symbol: str) -> str:
"""
Normalize symbols across US and exchange-prefixed formats.
Examples:
- sh600519 -> 600519
- 600519.SH -> 600519
- aapl -> AAPL
- hk00700 -> HK00700
"""
canonical = canonical_symbol(symbol)
if canonical.startswith(("SH", "SZ", "BJ")) and len(canonical) > 2:
candidate = canonical[2:]
if candidate.isdigit() and len(candidate) in (5, 6):
return candidate
if "." in canonical:
base, suffix = canonical.rsplit(".", 1)
if suffix in {"SH", "SZ", "SS", "BJ"} and base.isdigit():
return base
return canonical
def detect_market(symbol: str) -> str:
"""Infer market tag from normalized symbol."""
normalized = normalize_symbol(symbol)
if normalized.startswith("HK") or (
normalized.isdigit() and len(normalized) == 5
):
return "hk"
if normalized.isalpha() or (
"/" not in normalized and not normalized.isdigit()
):
return "us"
return "cn"
def describe_symbol(symbol: str) -> MarketSymbol:
"""Return normalized symbol metadata."""
normalized = normalize_symbol(symbol)
return MarketSymbol(
raw=symbol,
canonical=normalized,
market=detect_market(normalized),
)
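
Worked examples; the expected values follow directly from the rules above:

from backend.data.provider_utils import describe_symbol, normalize_symbol

assert normalize_symbol("sh600519") == "600519"
assert normalize_symbol("600519.SS") == "600519"
assert normalize_symbol(" aapl ") == "AAPL"

info = describe_symbol("hk00700")
print(info.canonical, info.market)       # HK00700 hk
print(describe_symbol("600519").market)  # cn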


@@ -0,0 +1,387 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Automatic Incremental Historical Data Update Module
Features:
1. Fetch stock historical data from configured API (Finnhub or Financial Datasets)
2. Incrementally update CSV files in ret_data directory
3. Automatically detect last update date, only download new data
4. Calculate returns (ret)
5. Support batch updates for multiple stocks
"""
# flake8: noqa: E501
import logging
import os
import sys
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional
import pandas as pd
from dotenv import load_dotenv

# Optional trading-calendar backends; get_trading_dates falls back to plain
# business days when neither package is installed.
try:
    import pandas_market_calendars as mcal
except ImportError:
    mcal = None
try:
    import exchange_calendars as xcals
except ImportError:
    xcals = None

# Add project root directory to path before importing backend modules
BASE_DIR = Path(__file__).resolve().parents[2]
if str(BASE_DIR) not in sys.path:
    sys.path.append(str(BASE_DIR))

from backend.config.data_config import get_config
from backend.tools.data_tools import get_prices, prices_to_df
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger(__name__)
class DataUpdater:
"""Data updater"""
data_dir: Path
    def __init__(
        self,
        data_dir: Optional[str] = None,
        start_date: str = "2022-01-01",
    ):
"""
Initialize data updater
Args:
data_dir: Data storage directory, defaults to backend/data/ret_data
start_date: Historical data start date (YYYY-MM-DD)
"""
# Get config from centralized source
config = get_config()
self.data_source = config.source
self.api_key = config.api_key
# Set data directory
if data_dir is None:
self.data_dir = BASE_DIR / "backend" / "data" / "ret_data"
else:
self.data_dir = Path(data_dir)
# Ensure directory exists
self.data_dir.mkdir(parents=True, exist_ok=True)
self.start_date = start_date
# Initialize Finnhub client if needed
if self.data_source == "finnhub":
import finnhub
self.client = finnhub.Client(api_key=self.api_key)
logger.info("Finnhub client initialized")
else:
self.client = None
logger.info("Financial Datasets API configured")
def get_trading_dates(self, start_date: str, end_date: str) -> List[str]:
"""Get US stock market trading date sequence."""
try:
if mcal is not None:
nyse = mcal.get_calendar("NYSE")
trading_dates = nyse.valid_days(
start_date=start_date,
end_date=end_date,
)
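                # Illustration: for 2024-01-01 .. 2024-01-07 this yields only
                # Jan 2-5; Jan 1 is a NYSE holiday and Jan 6-7 are a weekend.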
return [date.strftime("%Y-%m-%d") for date in trading_dates]
elif xcals is not None:
nyse = xcals.get_calendar("XNYS")
trading_dates = nyse.sessions_in_range(start_date, end_date)
return [date.strftime("%Y-%m-%d") for date in trading_dates]
except Exception as e:
logger.warning(
f"Failed to get US trading calendar, using business days: {e}",
)
# Fallback to simple business day method
date_range = pd.date_range(start_date, end_date, freq="B")
return [date.strftime("%Y-%m-%d") for date in date_range]
def get_last_date_from_csv(self, ticker: str) -> Optional[datetime]:
"""Get last data date from CSV file."""
csv_path = self.data_dir / f"{ticker}.csv"
if not csv_path.exists():
logger.info(f"{ticker}.csv does not exist, will create new file")
return None
try:
df = pd.read_csv(csv_path)
if df.empty or "time" not in df.columns:
return None
last_date_str = df["time"].iloc[-1]
last_date = datetime.strptime(last_date_str, "%Y-%m-%d")
logger.info(f"{ticker} last data date: {last_date_str}")
return last_date
except Exception as e:
logger.warning(f"Failed to read {ticker}.csv: {e}")
return None
def fetch_data_from_api(
self,
ticker: str,
start_date: datetime,
end_date: datetime,
) -> Optional[pd.DataFrame]:
"""Fetch data from configured API."""
start_date_str = start_date.strftime("%Y-%m-%d")
end_date_str = end_date.strftime("%Y-%m-%d")
logger.info(
f"Fetching {ticker} data from {self.data_source}: {start_date_str} to {end_date_str}",
)
prices = get_prices(
ticker=ticker,
start_date=start_date_str,
end_date=end_date_str,
)
if not prices:
logger.warning(f"{ticker} no data returned from API")
return None
# Convert to DataFrame
df = prices_to_df(prices)
df = df.reset_index()
df["time"] = df["Date"].dt.strftime("%Y-%m-%d")
# Calculate returns (next day return)
df["ret"] = df["close"].pct_change().shift(-1)
# Select needed columns
df = df[["open", "close", "high", "low", "volume", "time", "ret"]]
logger.info(f"Successfully fetched {ticker} data: {len(df)} records")
return df
def merge_and_save(self, ticker: str, new_data: pd.DataFrame) -> bool:
"""Merge old and new data and save."""
csv_path = self.data_dir / f"{ticker}.csv"
try:
if csv_path.exists():
old_data = pd.read_csv(csv_path)
logger.info(f"{ticker} existing data: {len(old_data)} records")
# Merge and deduplicate
combined = pd.concat([old_data, new_data], ignore_index=True)
combined = combined.drop_duplicates(
subset=["time"],
keep="last",
)
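            # Dedup illustration: if old and new data both contain a row
            # for the same "time", keep="last" retains the freshly fetched
            # one, so re-downloads safely overwrite stale values.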
combined = combined.sort_values("time").reset_index(drop=True)
# Recalculate returns
combined["ret"] = combined["close"].pct_change().shift(-1)
logger.info(f"{ticker} merged data: {len(combined)} records")
else:
combined = new_data
logger.info(f"{ticker} new file: {len(combined)} records")
combined.to_csv(csv_path, index=False)
logger.info(f"{ticker} data saved to: {csv_path}")
return True
except Exception as e:
logger.error(f"Failed to save {ticker} data: {e}")
return False
def update_ticker(
self,
ticker: str,
force_full_update: bool = False,
) -> bool:
"""Update data for a single stock."""
logger.info(f"{'='*60}")
logger.info(f"Starting update for {ticker}")
logger.info(f"{'='*60}")
# Determine start date
if force_full_update:
start_date = datetime.strptime(self.start_date, "%Y-%m-%d")
logger.info(f"Force full update, start date: {start_date.date()}")
else:
last_date = self.get_last_date_from_csv(ticker)
if last_date:
start_date = last_date + timedelta(days=1)
logger.info(
f"Incremental update, start date: {start_date.date()}",
)
else:
start_date = datetime.strptime(self.start_date, "%Y-%m-%d")
logger.info(f"First update, start date: {start_date.date()}")
end_date = datetime.now()
if start_date.date() >= end_date.date():
logger.info(f"{ticker} data is up to date, no update needed")
return True
new_data = self.fetch_data_from_api(ticker, start_date, end_date)
if new_data is None or new_data.empty:
days_diff = (end_date - start_date).days
if days_diff <= 3:
logger.info(
f"{ticker} has no new data (may be weekend/holiday)",
)
return True
else:
logger.warning(f"{ticker} has no new data")
return False
success = self.merge_and_save(ticker, new_data)
if success:
logger.info(f"{ticker} update completed")
else:
logger.error(f"{ticker} update failed")
return success
def update_all_tickers(
self,
tickers: List[str],
force_full_update: bool = False,
) -> Dict[str, bool]:
"""Batch update multiple stocks."""
results = {}
logger.info(f"{'='*60}")
logger.info(f"Starting batch update for {len(tickers)} stocks")
logger.info(f"Stock list: {', '.join(tickers)}")
logger.info(f"{'='*60}")
for i, ticker in enumerate(tickers, 1):
logger.info(f"[{i}/{len(tickers)}] Processing {ticker}")
results[ticker] = self.update_ticker(ticker, force_full_update)
# API rate limiting
if i < len(tickers):
time.sleep(1)
# Print summary
logger.info(f"{'='*60}")
logger.info("Update Summary")
logger.info(f"{'='*60}")
success_count = sum(results.values())
fail_count = len(results) - success_count
logger.info(f"Success: {success_count}")
logger.info(f"Failed: {fail_count}")
if fail_count > 0:
failed_tickers = [t for t, s in results.items() if not s]
logger.warning(f"Failed stocks: {', '.join(failed_tickers)}")
logger.info(f"{'='*60}\n")
return results
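# Usage sketch (illustrative tickers); programmatic callers can bypass main():
#   updater = DataUpdater(start_date="2022-01-01")
#   results = updater.update_all_tickers(["AAPL", "MSFT"])
#   # -> {"AAPL": True, "MSFT": True} when both updates succeed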
def main():
"""Command line entry point"""
import argparse
parser = argparse.ArgumentParser(
description="Automatically update stock historical data",
)
parser.add_argument(
"--tickers",
type=str,
help="Stock ticker list (comma-separated), e.g.: AAPL,MSFT,GOOGL",
)
parser.add_argument(
"--data-dir",
type=str,
help="Data storage directory (default: backend/data/ret_data)",
)
parser.add_argument(
"--start-date",
type=str,
default="2022-01-01",
help="Historical data start date (YYYY-MM-DD, default: 2022-01-01)",
)
parser.add_argument(
"--force",
action="store_true",
help="Force full update (re-download all data)",
)
args = parser.parse_args()
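    # Example invocations (script name illustrative):
    #   python update_data.py --tickers AAPL,MSFT,GOOGL
    #   python update_data.py --tickers AAPL --force --start-date 2023-01-01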
# Load environment variables
load_dotenv()
# Validate API key is available
try:
config = get_config()
logger.info(f"Using data source: {config.source}")
except ValueError as e:
logger.error(str(e))
sys.exit(1)
# Get stock list
if args.tickers:
tickers = [t.strip().upper() for t in args.tickers.split(",")]
else:
tickers_env = os.getenv("TICKERS", "")
if tickers_env:
tickers = [t.strip().upper() for t in tickers_env.split(",")]
else:
logger.error("Stock list not provided")
logger.error(
"Please set via --tickers parameter or TICKERS environment variable",
)
sys.exit(1)
# Create updater
updater = DataUpdater(
data_dir=args.data_dir,
start_date=args.start_date,
)
# Execute update
try:
results = updater.update_all_tickers(
tickers,
force_full_update=args.force,
)
    except Exception as e:
        # Unrecoverable API error; per-ticker gaps (weekends, holidays)
        # are already handled inside update_ticker
        logger.error(f"Batch update aborted: {e}")
        sys.exit(1)
# Return status code
success_count = sum(results.values())
if success_count == len(results):
logger.info("All stocks updated successfully!")
sys.exit(0)
    elif success_count == 0:
        logger.warning(
            "No stocks were updated (no new data, or all fetches failed)",
        )
        sys.exit(0)
else:
logger.warning("Some stocks failed to update, but will continue")
sys.exit(0)
if __name__ == "__main__":
main()

50
backend/data/schema.py Normal file
View File

@@ -0,0 +1,50 @@
# -*- coding: utf-8 -*-
"""Compatibility schema bridge.
This module preserves the legacy ``backend.data.schema`` import path while
delegating the actual schema definitions to ``shared.schema``. Keeping one
canonical DTO set avoids drift as the monolith is split into service-specific
packages.
"""
from shared.schema import (
AgentStateData,
AgentStateMetadata,
AnalystSignal,
CompanyFacts,
CompanyFactsResponse,
CompanyNews,
CompanyNewsResponse,
FinancialMetrics,
FinancialMetricsResponse,
InsiderTrade,
InsiderTradeResponse,
LineItem,
LineItemResponse,
Portfolio,
Position,
Price,
PriceResponse,
TickerAnalysis,
)
__all__ = [
"Price",
"PriceResponse",
"FinancialMetrics",
"FinancialMetricsResponse",
"LineItem",
"LineItemResponse",
"InsiderTrade",
"InsiderTradeResponse",
"CompanyNews",
"CompanyNewsResponse",
"CompanyFacts",
"CompanyFactsResponse",
"Position",
"Portfolio",
"AnalystSignal",
"TickerAnalysis",
"AgentStateData",
"AgentStateMetadata",
]
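# Usage sketch: both import paths resolve to the same class objects, so
# isinstance checks stay consistent across legacy and service code:
# >>> from backend.data.schema import Price
# >>> from shared.schema import Price as SharedPrice
# >>> Price is SharedPrice
# True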