# -*- coding: utf-8 -*-
"""Polygon client used for long-lived market research ingestion."""

from __future__ import annotations

import os
import time
from datetime import datetime, timezone
from typing import Any, Optional

import requests

BASE = "https://api.polygon.io"


def _headers() -> dict[str, str]:
    """Build auth headers from the POLYGON_API_KEY environment variable."""
    api_key = os.getenv("POLYGON_API_KEY", "").strip()
    if not api_key:
        raise ValueError("Missing required API key: POLYGON_API_KEY")
    return {"Authorization": f"Bearer {api_key}"}


def http_get(
    url: str,
    params: Optional[dict[str, Any]] = None,
    *,
    max_retries: int = 8,
    backoff: float = 2.0,
) -> requests.Response:
    """HTTP GET with exponential backoff and 429/5xx handling."""
    for attempt in range(max_retries):
        try:
            response = requests.get(
                url,
                params=params or {},
                headers=_headers(),
                timeout=30,
            )
        except requests.RequestException:
            # Network-level failure: re-raise on the last attempt,
            # otherwise back off and retry.
            if attempt == max_retries - 1:
                raise
            time.sleep((backoff**attempt) + 0.5)
            continue

        if response.status_code == 429:
            # Rate limited: raise on the last attempt; otherwise honor
            # Retry-After when it is a plain integer, falling back to
            # capped exponential backoff.
            if attempt == max_retries - 1:
                response.raise_for_status()
            retry_after = response.headers.get("Retry-After")
            wait = (
                float(retry_after)
                if retry_after and retry_after.isdigit()
                else min((backoff**attempt) + 1.0, 60.0)
            )
            time.sleep(wait)
            continue

        if 500 <= response.status_code < 600:
            # Transient server error: retry with capped backoff.
            if attempt == max_retries - 1:
                response.raise_for_status()
            time.sleep(min((backoff**attempt) + 1.0, 60.0))
            continue

        response.raise_for_status()
        return response

    raise RuntimeError("Unreachable")


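# Illustrative direct use of http_get (a sketch, not part of this module's
# public surface); the reference-tickers query parameters are assumptions:
#
#     resp = http_get(f"{BASE}/v3/reference/tickers", params={"market": "stocks", "limit": 10})
#     names = [t.get("name") for t in resp.json().get("results", [])]

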
def fetch_ticker_details(symbol: str) -> dict[str, Any]:
    """Fetch company metadata from Polygon."""
    response = http_get(f"{BASE}/v3/reference/tickers/{symbol}")
    return response.json().get("results", {}) or {}


def fetch_ohlc(symbol: str, start_date: str, end_date: str) -> list[dict[str, Any]]:
    """Fetch daily OHLC bars from Polygon's aggregates endpoint."""
    response = http_get(
        f"{BASE}/v2/aggs/ticker/{symbol}/range/1/day/{start_date}/{end_date}",
        params={"adjusted": "true", "sort": "asc", "limit": 50000},
    )
    results = response.json().get("results") or []
    rows: list[dict[str, Any]] = []
    for item in results:
        rows.append(
            {
                # Polygon reports the bar timestamp ("t") in epoch milliseconds.
                "date": datetime.fromtimestamp(
                    int(item["t"]) / 1000,
                    tz=timezone.utc,
                ).date().isoformat(),
                "open": item.get("o"),
                "high": item.get("h"),
                "low": item.get("l"),
                "close": item.get("c"),
                "volume": item.get("v"),
                "vwap": item.get("vw"),
                "transactions": item.get("n"),
            }
        )
    return rows


def fetch_news(
    symbol: str,
    start_date: str,
    end_date: str,
    *,
    per_page: int = 50,
    page_sleep: float = 1.2,
    max_pages: Optional[int] = None,
) -> list[dict[str, Any]]:
    """Fetch all Polygon news for a ticker, following next_url pagination."""
    url = f"{BASE}/v2/reference/news"
    params = {
        "ticker": symbol,
        "published_utc.gte": start_date,
        "published_utc.lte": end_date,
        "limit": per_page,
        "order": "asc",
    }
    next_url: Optional[str] = None
    pages = 0
    all_articles: list[dict[str, Any]] = []
    seen_ids: set[str] = set()

    while True:
        # Polygon's next_url already encodes the query string, so pass
        # params only on the first request.
        response = http_get(next_url or url, params=None if next_url else params)
        data = response.json()
        results = data.get("results") or []
        if not results:
            break

        for item in results:
            article_id = item.get("id")
            # Skip articles already seen; items without an id are kept as-is.
            if article_id and article_id in seen_ids:
                continue
            all_articles.append(
                {
                    "id": article_id,
                    "publisher": (item.get("publisher") or {}).get("name"),
                    "title": item.get("title"),
                    "author": item.get("author"),
                    "published_utc": item.get("published_utc"),
                    "amp_url": item.get("amp_url"),
                    "article_url": item.get("article_url"),
                    "tickers": item.get("tickers"),
                    "description": item.get("description"),
                    "insights": item.get("insights"),
                }
            )
            if article_id:
                seen_ids.add(article_id)

        next_url = data.get("next_url")
        pages += 1
        if max_pages is not None and pages >= max_pages:
            break
        if not next_url:
            break
        time.sleep(page_sleep)

    return all_articles
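

# Usage sketch, assuming POLYGON_API_KEY is set in the environment. The
# symbol and date range below are illustrative placeholders, not values
# this module depends on.
if __name__ == "__main__":
    details = fetch_ticker_details("AAPL")
    print(details.get("name"))

    bars = fetch_ohlc("AAPL", "2024-01-02", "2024-01-31")
    print(f"{len(bars)} daily bars")

    # Cap pagination so an ad-hoc run cannot crawl the full history.
    articles = fetch_news("AAPL", "2024-01-02", "2024-01-31", max_pages=2)
    print(f"{len(articles)} news articles")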