# -*- coding: utf-8 -*-
"""Polygon client used for long-lived market research ingestion."""

from __future__ import annotations

import os
import time
from datetime import datetime, timezone
from typing import Any, Optional

import requests

BASE = "https://api.polygon.io"


def _headers() -> dict[str, str]:
    """Build auth headers from the POLYGON_API_KEY environment variable."""
    api_key = os.getenv("POLYGON_API_KEY", "").strip()
    if not api_key:
        raise ValueError("Missing required API key: POLYGON_API_KEY")
    return {"Authorization": f"Bearer {api_key}"}


def http_get(
    url: str,
    params: Optional[dict[str, Any]] = None,
    *,
    max_retries: int = 8,
    backoff: float = 2.0,
) -> requests.Response:
    """HTTP GET with exponential backoff and 429/5xx handling."""
    for attempt in range(max_retries):
        try:
            response = requests.get(
                url,
                params=params or {},
                headers=_headers(),
                timeout=30,
            )
        except requests.RequestException:
            # Network-level failure: re-raise on the last attempt,
            # otherwise back off and retry.
            if attempt == max_retries - 1:
                raise
            time.sleep((backoff**attempt) + 0.5)
            continue

        if response.status_code == 429:
            # Rate limited: raise on the last attempt; otherwise honor
            # Retry-After when it is a plain integer, falling back to
            # capped exponential backoff.
            if attempt == max_retries - 1:
                response.raise_for_status()
            retry_after = response.headers.get("Retry-After")
            wait = (
                float(retry_after)
                if retry_after and retry_after.isdigit()
                else min((backoff**attempt) + 1.0, 60.0)
            )
            time.sleep(wait)
            continue

        if 500 <= response.status_code < 600:
            # Transient server error: retry with capped backoff.
            if attempt == max_retries - 1:
                response.raise_for_status()
            time.sleep(min((backoff**attempt) + 1.0, 60.0))
            continue

        response.raise_for_status()
        return response

    raise RuntimeError("Unreachable")


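# Illustrative direct use of http_get (a sketch, not part of this module's
# public surface); the reference-tickers query parameters are assumptions:
#
#     resp = http_get(f"{BASE}/v3/reference/tickers", params={"market": "stocks", "limit": 10})
#     names = [t.get("name") for t in resp.json().get("results", [])]

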
def fetch_ticker_details(symbol: str) -> dict[str, Any]:
    """Fetch company metadata from Polygon."""
    response = http_get(f"{BASE}/v3/reference/tickers/{symbol}")
    return response.json().get("results", {}) or {}


def fetch_ohlc(symbol: str, start_date: str, end_date: str) -> list[dict[str, Any]]:
    """Fetch daily OHLC bars from Polygon's aggregates endpoint."""
    response = http_get(
        f"{BASE}/v2/aggs/ticker/{symbol}/range/1/day/{start_date}/{end_date}",
        params={"adjusted": "true", "sort": "asc", "limit": 50000},
    )
    results = response.json().get("results") or []
    rows: list[dict[str, Any]] = []
    for item in results:
        rows.append(
            {
                # Polygon reports the bar timestamp ("t") in epoch milliseconds.
                "date": datetime.fromtimestamp(
                    int(item["t"]) / 1000,
                    tz=timezone.utc,
                ).date().isoformat(),
                "open": item.get("o"),
                "high": item.get("h"),
                "low": item.get("l"),
                "close": item.get("c"),
                "volume": item.get("v"),
                "vwap": item.get("vw"),
                "transactions": item.get("n"),
            }
        )
    return rows


def fetch_news(
    symbol: str,
    start_date: str,
    end_date: str,
    *,
    per_page: int = 50,
    page_sleep: float = 1.2,
    max_pages: Optional[int] = None,
) -> list[dict[str, Any]]:
    """Fetch all Polygon news for a ticker, following next_url pagination."""
    url = f"{BASE}/v2/reference/news"
    params = {
        "ticker": symbol,
        "published_utc.gte": start_date,
        "published_utc.lte": end_date,
        "limit": per_page,
        "order": "asc",
    }
    next_url: Optional[str] = None
    pages = 0
    all_articles: list[dict[str, Any]] = []
    seen_ids: set[str] = set()

    while True:
        # Polygon's next_url already encodes the query string, so pass
        # params only on the first request.
        response = http_get(next_url or url, params=None if next_url else params)
        data = response.json()
        results = data.get("results") or []
        if not results:
            break

        for item in results:
            article_id = item.get("id")
            # Skip articles already seen; items without an id are kept as-is.
            if article_id and article_id in seen_ids:
                continue
            all_articles.append(
                {
                    "id": article_id,
                    "publisher": (item.get("publisher") or {}).get("name"),
                    "title": item.get("title"),
                    "author": item.get("author"),
                    "published_utc": item.get("published_utc"),
                    "amp_url": item.get("amp_url"),
                    "article_url": item.get("article_url"),
                    "tickers": item.get("tickers"),
                    "description": item.get("description"),
                    "insights": item.get("insights"),
                }
            )
            if article_id:
                seen_ids.add(article_id)

        next_url = data.get("next_url")
        pages += 1
        if max_pages is not None and pages >= max_pages:
            break
        if not next_url:
            break
        time.sleep(page_sleep)

    return all_articles
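

# Usage sketch, assuming POLYGON_API_KEY is set in the environment. The
# symbol and date range below are illustrative placeholders, not values
# this module depends on.
if __name__ == "__main__":
    details = fetch_ticker_details("AAPL")
    print(details.get("name"))

    bars = fetch_ohlc("AAPL", "2024-01-02", "2024-01-31")
    print(f"{len(bars)} daily bars")

    # Cap pagination so an ad-hoc run cannot crawl the full history.
    articles = fetch_news("AAPL", "2024-01-02", "2024-01-31", max_pages=2)
    print(f"{len(articles)} news articles")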