Files
evotraders/backend/data/market_store.py
2026-03-30 17:46:44 +08:00

1107 lines
41 KiB
Python

# -*- coding: utf-8 -*-
"""Long-lived Polygon-backed market research storage."""
from __future__ import annotations
import hashlib
import json
import os
import sqlite3
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterable, Optional
SCHEMA = """
CREATE TABLE IF NOT EXISTS tickers (
symbol TEXT PRIMARY KEY,
name TEXT,
sector TEXT,
is_active INTEGER DEFAULT 1,
last_price_fetch TEXT,
last_news_fetch TEXT,
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS ohlc (
symbol TEXT NOT NULL,
date TEXT NOT NULL,
open REAL,
high REAL,
low REAL,
close REAL,
volume REAL,
vwap REAL,
transactions INTEGER,
source TEXT,
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL,
PRIMARY KEY (symbol, date)
);
CREATE INDEX IF NOT EXISTS idx_ohlc_symbol_date ON ohlc(symbol, date DESC);
CREATE TABLE IF NOT EXISTS news_raw (
id TEXT PRIMARY KEY,
published_utc TEXT,
title TEXT,
summary TEXT,
publisher TEXT,
author TEXT,
article_url TEXT,
amp_url TEXT,
source TEXT,
related_json TEXT,
insights_json TEXT,
raw_json TEXT NOT NULL,
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_news_raw_published ON news_raw(published_utc DESC);
CREATE TABLE IF NOT EXISTS news_ticker (
news_id TEXT NOT NULL,
symbol TEXT NOT NULL,
trade_date TEXT,
PRIMARY KEY (news_id, symbol)
);
CREATE INDEX IF NOT EXISTS idx_news_ticker_symbol_date ON news_ticker(symbol, trade_date DESC);
CREATE TABLE IF NOT EXISTS news_analysis (
news_id TEXT NOT NULL,
symbol TEXT NOT NULL,
trade_date TEXT,
relevance TEXT,
sentiment TEXT,
key_discussion TEXT,
summary TEXT,
reason_growth TEXT,
reason_decrease TEXT,
ret_t0 REAL,
ret_t1 REAL,
ret_t3 REAL,
ret_t5 REAL,
ret_t10 REAL,
analysis_source TEXT,
raw_json TEXT,
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL,
PRIMARY KEY (news_id, symbol),
FOREIGN KEY (news_id) REFERENCES news_raw(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_news_analysis_symbol_date ON news_analysis(symbol, trade_date DESC);
CREATE INDEX IF NOT EXISTS idx_news_analysis_symbol_sentiment ON news_analysis(symbol, sentiment, trade_date DESC);
CREATE INDEX IF NOT EXISTS idx_news_analysis_symbol_relevance ON news_analysis(symbol, relevance, trade_date DESC);
CREATE TABLE IF NOT EXISTS story_cache (
symbol TEXT NOT NULL,
as_of_date TEXT NOT NULL,
content TEXT NOT NULL,
source TEXT,
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL,
PRIMARY KEY (symbol, as_of_date)
);
CREATE TABLE IF NOT EXISTS similar_day_cache (
symbol TEXT NOT NULL,
target_date TEXT NOT NULL,
payload_json TEXT NOT NULL,
source TEXT,
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL,
PRIMARY KEY (symbol, target_date)
);
"""
def get_market_db_path() -> Path:
    """Resolve the long-lived market database path.

    A non-blank MARKET_DB_PATH environment variable wins (with ``~``
    expansion); otherwise the default location under the repository's
    ``data/`` directory is used.
    """
    configured = os.getenv("MARKET_DB_PATH", "").strip()
    if configured:
        return Path(configured).expanduser()
    repo_root = Path(__file__).resolve().parents[2]
    return repo_root / "data" / "market_research.db"
def _json_dumps(value: Any) -> str:
return json.dumps(value, ensure_ascii=False, sort_keys=True, default=str)
def _json_loads(value: str | None) -> Any:
if not value:
return None
try:
return json.loads(value)
except json.JSONDecodeError:
return value
def _hash_news_id(symbol: str, article: dict[str, Any], fallback_index: int) -> str:
base = article.get("id") or article.get("article_url") or article.get("title") or f"{symbol}-{fallback_index}"
digest = hashlib.sha1(str(base).encode("utf-8")).hexdigest()
return article.get("id") or f"polygon:{symbol}:{digest}"
def _utc_timestamp() -> str:
return datetime.now(timezone.utc).isoformat(timespec="seconds")
class MarketStore:
    """SQLite-backed market research warehouse. Use get_instance() for the singleton."""
    # Class-level singleton slot; mutated only by __new__.
    _instance: Optional["MarketStore"] = None
    def __new__(cls, db_path: Path | None = None) -> "MarketStore":
        # Return the cached singleton when compatible, otherwise create and
        # install a fresh instance.
        #
        # NOTE(review): passing a db_path *different* from the current
        # instance's replaces the singleton; the old instance is dropped but
        # its database file stays on disk — confirm this swap is intentional.
        if cls._instance is not None:
            # Reuse when no explicit path was requested, or when the requested
            # path equals the current one.  (db_path is non-None on this branch
            # thanks to the short-circuit, so the `db_path or
            # get_market_db_path()` fallback never actually fires.)
            if db_path is None or cls._instance.db_path == Path(db_path or get_market_db_path()):
                return cls._instance
        instance = super().__new__(cls)
        cls._instance = instance
        return instance
def __init__(self, db_path: Path | None = None):
    """Bind this (singleton) store to a database file and ensure the schema exists."""
    # __new__ may hand back an already-built singleton; never re-initialize it.
    if getattr(self, "_initialized", False):
        return
    resolved = Path(db_path) if db_path else get_market_db_path()
    self.db_path = resolved
    # sqlite3 creates the file, but not missing parent directories.
    resolved.parent.mkdir(parents=True, exist_ok=True)
    self._init_db()
    self._initialized = True
@classmethod
def get_instance(cls, db_path: Path | None = None) -> "MarketStore":
    """Return the process-wide MarketStore singleton, creating it on first use.

    Construction funnels through __new__/__init__, which enforce the
    singleton and skip repeated initialization.
    """
    return cls(db_path)
def _connect(self) -> sqlite3.Connection:
conn = sqlite3.connect(self.db_path)
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA foreign_keys=ON")
return conn
def _init_db(self):
    """Create all tables and indexes if they do not yet exist (idempotent DDL)."""
    conn = self._connect()
    try:
        with conn:  # commit on success, rollback on error
            conn.executescript(SCHEMA)
    finally:
        # Fix: sqlite3's connection context manager commits/rolls back but
        # never closes — close explicitly to avoid leaking the connection.
        conn.close()
def upsert_ticker(
    self,
    *,
    symbol: str,
    name: str | None = None,
    sector: str | None = None,
    is_active: bool = True,
) -> None:
    """Insert or update one ticker row.

    On conflict, name/sector are only overwritten when a non-NULL value is
    supplied; is_active and updated_at are always refreshed.
    """
    timestamp = _utc_timestamp()
    conn = self._connect()
    try:
        with conn:  # commit on success, rollback on error
            conn.execute(
                """
                INSERT INTO tickers
                (symbol, name, sector, is_active, created_at, updated_at)
                VALUES (?, ?, ?, ?, ?, ?)
                ON CONFLICT(symbol) DO UPDATE SET
                name = COALESCE(excluded.name, tickers.name),
                sector = COALESCE(excluded.sector, tickers.sector),
                is_active = excluded.is_active,
                updated_at = excluded.updated_at
                """,
                (symbol, name, sector, 1 if is_active else 0, timestamp, timestamp),
            )
    finally:
        # Fix: sqlite3's context manager never closes the connection.
        conn.close()
def update_fetch_watermark(
    self,
    *,
    symbol: str,
    price_date: str | None = None,
    news_date: str | None = None,
) -> None:
    """Record the most recent price/news fetch dates for *symbol*.

    Creates the ticker row if missing; a None watermark leaves the
    previously stored value untouched (COALESCE on conflict).
    """
    timestamp = _utc_timestamp()
    conn = self._connect()
    try:
        with conn:  # commit on success, rollback on error
            conn.execute(
                """
                INSERT INTO tickers (symbol, created_at, updated_at, last_price_fetch, last_news_fetch)
                VALUES (?, ?, ?, ?, ?)
                ON CONFLICT(symbol) DO UPDATE SET
                last_price_fetch = COALESCE(excluded.last_price_fetch, tickers.last_price_fetch),
                last_news_fetch = COALESCE(excluded.last_news_fetch, tickers.last_news_fetch),
                updated_at = excluded.updated_at
                """,
                (symbol, timestamp, timestamp, price_date, news_date),
            )
    finally:
        # Fix: close explicitly; the context manager alone leaks the connection.
        conn.close()
def get_ticker_watermarks(self, symbol: str) -> dict[str, Any]:
with self._connect() as conn:
row = conn.execute(
"""
SELECT symbol, last_price_fetch, last_news_fetch
FROM tickers
WHERE symbol = ?
""",
(symbol,),
).fetchone()
return dict(row) if row else {
"symbol": symbol,
"last_price_fetch": None,
"last_news_fetch": None,
}
def get_latest_news_date(self, symbol: str) -> str | None:
"""Return the latest stored published news date for one ticker."""
with self._connect() as conn:
row = conn.execute(
"""
SELECT MAX(substr(nr.published_utc, 1, 10)) AS latest_date
FROM news_ticker nt
JOIN news_raw nr ON nr.id = nt.news_id
WHERE nt.symbol = ?
""",
(symbol,),
).fetchone()
return str(row["latest_date"]).strip() if row and row["latest_date"] else None
def upsert_ohlc(self, symbol: str, rows: Iterable[dict[str, Any]], *, source: str = "polygon") -> int:
    """Insert or refresh daily OHLC bars for *symbol*.

    The whole batch runs in one transaction; existing (symbol, date) rows
    are overwritten field-by-field.  Returns the number of rows written.
    """
    timestamp = _utc_timestamp()
    count = 0
    conn = self._connect()
    try:
        with conn:  # one transaction for the batch; rollback on error
            for row in rows:
                conn.execute(
                    """
                    INSERT INTO ohlc
                    (symbol, date, open, high, low, close, volume, vwap, transactions, source, created_at, updated_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                    ON CONFLICT(symbol, date) DO UPDATE SET
                    open = excluded.open,
                    high = excluded.high,
                    low = excluded.low,
                    close = excluded.close,
                    volume = excluded.volume,
                    vwap = excluded.vwap,
                    transactions = excluded.transactions,
                    source = excluded.source,
                    updated_at = excluded.updated_at
                    """,
                    (
                        symbol,
                        row.get("date"),
                        row.get("open"),
                        row.get("high"),
                        row.get("low"),
                        row.get("close"),
                        row.get("volume"),
                        row.get("vwap"),
                        row.get("transactions"),
                        source,
                        timestamp,
                        timestamp,
                    ),
                )
                count += 1
    finally:
        # Fix: sqlite3's context manager never closes the connection.
        conn.close()
    return count
def upsert_news(self, symbol: str, articles: Iterable[dict[str, Any]], *, source: str = "polygon") -> int:
    """Store raw news articles plus their article->ticker links.

    Articles are keyed by a stable id (_hash_news_id), so re-ingestion
    updates rather than duplicates.  Every ticker the article mentions
    (falling back to *symbol*) gets a news_ticker link with a NULL
    trade_date, resolved later via set_trade_dates().

    Returns the number of articles processed.
    """
    timestamp = _utc_timestamp()
    count = 0
    conn = self._connect()
    try:
        with conn:  # one transaction for the batch; rollback on error
            for index, article in enumerate(articles):
                news_id = _hash_news_id(symbol, article, index)
                tickers = article.get("tickers") or [symbol]
                conn.execute(
                    """
                    INSERT INTO news_raw
                    (id, published_utc, title, summary, publisher, author, article_url, amp_url,
                    source, related_json, insights_json, raw_json, created_at, updated_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                    ON CONFLICT(id) DO UPDATE SET
                    published_utc = excluded.published_utc,
                    title = excluded.title,
                    summary = excluded.summary,
                    publisher = excluded.publisher,
                    author = excluded.author,
                    article_url = excluded.article_url,
                    amp_url = excluded.amp_url,
                    source = excluded.source,
                    related_json = excluded.related_json,
                    insights_json = excluded.insights_json,
                    raw_json = excluded.raw_json,
                    updated_at = excluded.updated_at
                    """,
                    (
                        news_id,
                        article.get("published_utc"),
                        article.get("title"),
                        # Some feeds use "description", others "summary".
                        article.get("description") or article.get("summary"),
                        # NOTE(review): assumes "publisher" is already a flat
                        # string here — confirm callers flatten provider objects.
                        article.get("publisher"),
                        article.get("author"),
                        article.get("article_url"),
                        article.get("amp_url"),
                        source,
                        _json_dumps(tickers),
                        _json_dumps(article.get("insights")) if article.get("insights") else None,
                        _json_dumps(article),
                        timestamp,
                        timestamp,
                    ),
                )
                for ticker in tickers:
                    # Normalize link symbols; trade_date is filled in later.
                    conn.execute(
                        """
                        INSERT OR IGNORE INTO news_ticker (news_id, symbol, trade_date)
                        VALUES (?, ?, NULL)
                        """,
                        (news_id, str(ticker).strip().upper()),
                    )
                count += 1
    finally:
        # Fix: sqlite3's context manager never closes the connection.
        conn.close()
    return count
def get_news_without_trade_date(self, symbol: str | None = None, *, limit: int = 5000) -> list[dict[str, Any]]:
sql = """
SELECT nt.news_id, nt.symbol, nr.published_utc
FROM news_ticker nt
JOIN news_raw nr ON nr.id = nt.news_id
WHERE nt.trade_date IS NULL
"""
params: list[Any] = []
if symbol:
sql += " AND nt.symbol = ?"
params.append(symbol)
sql += " ORDER BY nr.published_utc ASC LIMIT ?"
params.append(max(1, int(limit)))
with self._connect() as conn:
rows = conn.execute(sql, params).fetchall()
return [dict(row) for row in rows]
def set_trade_dates(self, rows: Iterable[dict[str, str]]) -> int:
count = 0
with self._connect() as conn:
for row in rows:
conn.execute(
"""
UPDATE news_ticker
SET trade_date = ?
WHERE news_id = ? AND symbol = ?
""",
(row["trade_date"], row["news_id"], row["symbol"]),
)
count += 1
return count
def get_ohlc(self, symbol: str, start_date: str, end_date: str) -> list[dict[str, Any]]:
with self._connect() as conn:
rows = conn.execute(
"""
SELECT symbol, date, open, high, low, close, volume, vwap, transactions, source
FROM ohlc
WHERE symbol = ? AND date >= ? AND date <= ?
ORDER BY date ASC
""",
(symbol, start_date, end_date),
).fetchall()
return [dict(row) for row in rows]
def upsert_news_analysis(
    self,
    symbol: str,
    rows: Iterable[dict[str, Any]],
    *,
    analysis_source: str = "local",
) -> int:
    """Insert or refresh per-article analysis rows for *symbol*.

    The symbol is upper-cased; rows without a news_id/id are skipped.  A
    per-row "analysis_source" overrides the keyword default.  Returns the
    number of rows written.
    """
    timestamp = _utc_timestamp()
    normalized_symbol = str(symbol or "").strip().upper()
    if not normalized_symbol:
        return 0
    count = 0
    conn = self._connect()
    try:
        with conn:  # one transaction for the batch; rollback on error
            for row in rows:
                news_id = str(row.get("news_id") or row.get("id") or "").strip()
                if not news_id:
                    continue
                conn.execute(
                    """
                    INSERT INTO news_analysis
                    (news_id, symbol, trade_date, relevance, sentiment, key_discussion, summary,
                    reason_growth, reason_decrease, ret_t0, ret_t1, ret_t3, ret_t5, ret_t10,
                    analysis_source, raw_json, created_at, updated_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                    ON CONFLICT(news_id, symbol) DO UPDATE SET
                    trade_date = excluded.trade_date,
                    relevance = excluded.relevance,
                    sentiment = excluded.sentiment,
                    key_discussion = excluded.key_discussion,
                    summary = excluded.summary,
                    reason_growth = excluded.reason_growth,
                    reason_decrease = excluded.reason_decrease,
                    ret_t0 = excluded.ret_t0,
                    ret_t1 = excluded.ret_t1,
                    ret_t3 = excluded.ret_t3,
                    ret_t5 = excluded.ret_t5,
                    ret_t10 = excluded.ret_t10,
                    analysis_source = excluded.analysis_source,
                    raw_json = excluded.raw_json,
                    updated_at = excluded.updated_at
                    """,
                    (
                        news_id,
                        normalized_symbol,
                        row.get("trade_date"),
                        row.get("relevance"),
                        row.get("sentiment"),
                        row.get("key_discussion"),
                        row.get("summary"),
                        row.get("reason_growth"),
                        row.get("reason_decrease"),
                        row.get("ret_t0"),
                        row.get("ret_t1"),
                        row.get("ret_t3"),
                        row.get("ret_t5"),
                        row.get("ret_t10"),
                        row.get("analysis_source") or analysis_source,
                        # Fall back to the whole row when no raw payload given.
                        _json_dumps(row.get("raw_json") or row),
                        timestamp,
                        timestamp,
                    ),
                )
                count += 1
    finally:
        # Fix: sqlite3's context manager never closes the connection.
        conn.close()
    return count
def get_analyzed_news_ids(
self,
symbol: str,
*,
start_date: str | None = None,
end_date: str | None = None,
) -> set[str]:
"""Return already analyzed news ids for a symbol and optional date window."""
sql = """
SELECT na.news_id
FROM news_analysis na
LEFT JOIN news_ticker nt ON nt.news_id = na.news_id AND nt.symbol = na.symbol
LEFT JOIN news_raw nr ON nr.id = na.news_id
WHERE na.symbol = ?
"""
params: list[Any] = [symbol]
if start_date:
sql += " AND COALESCE(na.trade_date, nt.trade_date, substr(nr.published_utc, 1, 10)) >= ?"
params.append(start_date)
if end_date:
sql += " AND COALESCE(na.trade_date, nt.trade_date, substr(nr.published_utc, 1, 10)) <= ?"
params.append(end_date)
with self._connect() as conn:
rows = conn.execute(sql, params).fetchall()
return {str(row["news_id"]) for row in rows if row["news_id"]}
def get_analyzed_news_sources(
self,
symbol: str,
*,
start_date: str | None = None,
end_date: str | None = None,
) -> dict[str, str]:
"""Return analyzed news ids mapped to analysis source."""
sql = """
SELECT na.news_id, na.analysis_source
FROM news_analysis na
LEFT JOIN news_ticker nt ON nt.news_id = na.news_id AND nt.symbol = na.symbol
LEFT JOIN news_raw nr ON nr.id = na.news_id
WHERE na.symbol = ?
"""
params: list[Any] = [symbol]
if start_date:
sql += " AND COALESCE(na.trade_date, nt.trade_date, substr(nr.published_utc, 1, 10)) >= ?"
params.append(start_date)
if end_date:
sql += " AND COALESCE(na.trade_date, nt.trade_date, substr(nr.published_utc, 1, 10)) <= ?"
params.append(end_date)
with self._connect() as conn:
rows = conn.execute(sql, params).fetchall()
return {
str(row["news_id"]): str(row["analysis_source"] or "").strip().lower()
for row in rows
if row["news_id"]
}
@staticmethod
def _normalize_enriched_news_row(row: sqlite3.Row | dict[str, Any]) -> dict[str, Any]:
    """Map a joined news_raw/news_ticker/news_analysis row onto the public
    news-item dict shape shared by the *_enriched getters.

    Fix: the original fell back to ``row.get(...)`` for a sqlite3.Row
    missing the optional "analysis_raw_json" column — sqlite3.Row has no
    ``.get``, so that path raised AttributeError.  Optional columns are now
    branched cleanly per input type; dict inputs also honor an existing
    "category" key instead of always returning "".
    """
    if isinstance(row, sqlite3.Row):
        available = row.keys()
        related = row["related_json"]
        # Optional columns: only some queries select them.
        analysis_raw = row["analysis_raw_json"] if "analysis_raw_json" in available else None
        category = row["category"] if "category" in available else ""
    else:
        related = row.get("related_json")
        analysis_raw = row.get("analysis_raw_json")
        category = row.get("category", "")
    analysis_meta = _json_loads(analysis_raw)
    model_label = analysis_meta.get("model_label") if isinstance(analysis_meta, dict) else None
    return {
        "id": row["id"],
        "ticker": row["symbol"],
        # Prefer the precise publish timestamp; fall back to the trade date.
        "date": row["published_utc"] or row["trade_date"],
        "trade_date": row["trade_date"],
        "source": row["publisher"] or row["raw_source"] or "polygon",
        "title": row["title"],
        # Analysis summary (when present) supersedes the raw article summary.
        "summary": row["analysis_summary"] or row["summary"],
        "url": row["article_url"],
        "related": _json_loads(related),
        "category": category,
        "relevance": row["relevance"],
        "sentiment": row["sentiment"],
        "key_discussion": row["key_discussion"],
        "reason_growth": row["reason_growth"],
        "reason_decrease": row["reason_decrease"],
        "ret_t0": row["ret_t0"],
        "ret_t1": row["ret_t1"],
        "ret_t3": row["ret_t3"],
        "ret_t5": row["ret_t5"],
        "ret_t10": row["ret_t10"],
        "analysis_source": row["analysis_source"],
        "analysis_model_label": model_label,
    }
def get_news_items_enriched(
    self,
    symbol: str,
    *,
    start_date: str | None = None,
    end_date: str | None = None,
    trade_date: str | None = None,
    limit: int = 100,
) -> list[dict[str, Any]]:
    """Return news items joined with their analysis rows, newest first.

    *trade_date* (exact-day match) takes precedence over the
    start_date/end_date window; the effective date is the link trade_date,
    then the analysis trade_date, then the published day.
    """
    sql = """
    SELECT nr.id,
    nt.symbol,
    nr.published_utc,
    nt.trade_date,
    nr.publisher,
    nr.source AS raw_source,
    nr.title,
    nr.summary,
    nr.article_url,
    nr.related_json,
    na.relevance,
    na.sentiment,
    na.key_discussion,
    na.summary AS analysis_summary,
    na.reason_growth,
    na.reason_decrease,
    na.ret_t0,
    na.ret_t1,
    na.ret_t3,
    na.ret_t5,
    na.ret_t10,
    na.analysis_source,
    na.raw_json AS analysis_raw_json
    FROM news_ticker nt
    JOIN news_raw nr ON nr.id = nt.news_id
    LEFT JOIN news_analysis na ON na.news_id = nt.news_id AND na.symbol = nt.symbol
    WHERE nt.symbol = ?
    """
    params: list[Any] = [symbol]
    if trade_date:
        sql += " AND COALESCE(nt.trade_date, na.trade_date, substr(nr.published_utc, 1, 10)) = ?"
        params.append(trade_date)
    else:
        if start_date:
            sql += " AND COALESCE(nt.trade_date, na.trade_date, substr(nr.published_utc, 1, 10)) >= ?"
            params.append(start_date)
        if end_date:
            sql += " AND COALESCE(nt.trade_date, na.trade_date, substr(nr.published_utc, 1, 10)) <= ?"
            params.append(end_date)
    sql += " ORDER BY COALESCE(nr.published_utc, nt.trade_date) DESC LIMIT ?"
    params.append(max(1, int(limit)))
    conn = self._connect()
    try:
        rows = conn.execute(sql, params).fetchall()
    finally:
        # Fix: close the read connection (context manager never closes).
        conn.close()
    return [self._normalize_enriched_news_row(row) for row in rows]
def get_news_items(
    self,
    symbol: str,
    *,
    start_date: str | None = None,
    end_date: str | None = None,
    limit: int = 100,
) -> list[dict[str, Any]]:
    """Return raw (un-analyzed) news items for *symbol*, newest first.

    Dates filter on the link trade_date, falling back to the published day;
    limit is clamped to at least one row.
    """
    sql = """
    SELECT nr.id,
    nt.symbol,
    nr.published_utc,
    nt.trade_date,
    nr.publisher,
    nr.title,
    nr.summary,
    nr.article_url,
    nr.related_json
    FROM news_ticker nt
    JOIN news_raw nr ON nr.id = nt.news_id
    WHERE nt.symbol = ?
    """
    params: list[Any] = [symbol]
    if start_date:
        sql += " AND COALESCE(nt.trade_date, substr(nr.published_utc, 1, 10)) >= ?"
        params.append(start_date)
    if end_date:
        sql += " AND COALESCE(nt.trade_date, substr(nr.published_utc, 1, 10)) <= ?"
        params.append(end_date)
    sql += " ORDER BY COALESCE(nr.published_utc, nt.trade_date) DESC LIMIT ?"
    params.append(max(1, int(limit)))
    conn = self._connect()
    try:
        rows = conn.execute(sql, params).fetchall()
    finally:
        # Fix: close the read connection (context manager never closes).
        conn.close()
    return [
        {
            "id": row["id"],
            "ticker": row["symbol"],
            "date": row["published_utc"] or row["trade_date"],
            "trade_date": row["trade_date"],
            "source": row["publisher"] or "polygon",
            "title": row["title"],
            "summary": row["summary"],
            "url": row["article_url"],
            "related": _json_loads(row["related_json"]),
            "category": "",
        }
        for row in rows
    ]
def get_news_timeline(
self,
symbol: str,
*,
start_date: str | None = None,
end_date: str | None = None,
) -> list[dict[str, Any]]:
sql = """
SELECT COALESCE(nt.trade_date, substr(nr.published_utc, 1, 10)) AS date,
COUNT(*) AS count,
COUNT(DISTINCT nr.publisher) AS source_count,
MAX(nr.title) AS top_title
FROM news_ticker nt
JOIN news_raw nr ON nr.id = nt.news_id
WHERE nt.symbol = ?
"""
params: list[Any] = [symbol]
if start_date:
sql += " AND COALESCE(nt.trade_date, substr(nr.published_utc, 1, 10)) >= ?"
params.append(start_date)
if end_date:
sql += " AND COALESCE(nt.trade_date, substr(nr.published_utc, 1, 10)) <= ?"
params.append(end_date)
sql += """
GROUP BY COALESCE(nt.trade_date, substr(nr.published_utc, 1, 10))
ORDER BY date ASC
"""
with self._connect() as conn:
rows = conn.execute(sql, params).fetchall()
return [
{
"date": row["date"],
"count": int(row["count"] or 0),
"source_count": int(row["source_count"] or 0),
"top_title": row["top_title"] or "",
}
for row in rows
if row["date"]
]
def get_news_timeline_enriched(
self,
symbol: str,
*,
start_date: str | None = None,
end_date: str | None = None,
) -> list[dict[str, Any]]:
sql = """
SELECT COALESCE(nt.trade_date, na.trade_date, substr(nr.published_utc, 1, 10)) AS date,
COUNT(*) AS count,
COUNT(DISTINCT nr.publisher) AS source_count,
MAX(nr.title) AS top_title,
SUM(CASE WHEN LOWER(COALESCE(na.sentiment, '')) = 'positive' THEN 1 ELSE 0 END) AS positive_count,
SUM(CASE WHEN LOWER(COALESCE(na.sentiment, '')) = 'negative' THEN 1 ELSE 0 END) AS negative_count,
SUM(CASE WHEN LOWER(COALESCE(na.sentiment, '')) IN ('neutral', '') OR na.sentiment IS NULL THEN 1 ELSE 0 END) AS neutral_count,
SUM(CASE WHEN LOWER(COALESCE(na.relevance, '')) IN ('high', 'relevant') THEN 1 ELSE 0 END) AS high_relevance_count
FROM news_ticker nt
JOIN news_raw nr ON nr.id = nt.news_id
LEFT JOIN news_analysis na ON na.news_id = nt.news_id AND na.symbol = nt.symbol
WHERE nt.symbol = ?
"""
params: list[Any] = [symbol]
if start_date:
sql += " AND COALESCE(nt.trade_date, na.trade_date, substr(nr.published_utc, 1, 10)) >= ?"
params.append(start_date)
if end_date:
sql += " AND COALESCE(nt.trade_date, na.trade_date, substr(nr.published_utc, 1, 10)) <= ?"
params.append(end_date)
sql += """
GROUP BY COALESCE(nt.trade_date, na.trade_date, substr(nr.published_utc, 1, 10))
ORDER BY date ASC
"""
with self._connect() as conn:
rows = conn.execute(sql, params).fetchall()
return [
{
"date": row["date"],
"count": int(row["count"] or 0),
"source_count": int(row["source_count"] or 0),
"top_title": row["top_title"] or "",
"positive_count": int(row["positive_count"] or 0),
"negative_count": int(row["negative_count"] or 0),
"neutral_count": int(row["neutral_count"] or 0),
"high_relevance_count": int(row["high_relevance_count"] or 0),
}
for row in rows
if row["date"]
]
def get_news_by_ids(self, symbol: str, article_ids: Iterable[str]) -> list[dict[str, Any]]:
    """Fetch raw news items for *symbol* restricted to the given ids, newest first.

    Blank/whitespace-only ids are dropped; an empty id set returns [].
    """
    ids = [str(item).strip() for item in article_ids if str(item).strip()]
    if not ids:
        return []
    # ids are bound as parameters; only the placeholder list is interpolated.
    placeholders = ",".join("?" for _ in ids)
    sql = f"""
    SELECT nr.id,
    nt.symbol,
    nr.published_utc,
    nt.trade_date,
    nr.publisher,
    nr.title,
    nr.summary,
    nr.article_url,
    nr.related_json
    FROM news_ticker nt
    JOIN news_raw nr ON nr.id = nt.news_id
    WHERE nt.symbol = ? AND nr.id IN ({placeholders})
    ORDER BY COALESCE(nr.published_utc, nt.trade_date) DESC
    """
    conn = self._connect()
    try:
        rows = conn.execute(sql, [symbol, *ids]).fetchall()
    finally:
        # Fix: close the read connection (context manager never closes).
        conn.close()
    return [
        {
            "id": row["id"],
            "ticker": row["symbol"],
            "date": row["published_utc"] or row["trade_date"],
            "trade_date": row["trade_date"],
            "source": row["publisher"] or "polygon",
            "title": row["title"],
            "summary": row["summary"],
            "url": row["article_url"],
            "related": _json_loads(row["related_json"]),
            "category": "",
        }
        for row in rows
    ]
def get_news_by_ids_enriched(
    self,
    symbol: str,
    article_ids: Iterable[str],
) -> list[dict[str, Any]]:
    """Fetch enriched (analysis-joined) news items for the given ids, newest first.

    Blank/whitespace-only ids are dropped; an empty id set returns [].
    """
    ids = [str(item).strip() for item in article_ids if str(item).strip()]
    if not ids:
        return []
    # ids are bound as parameters; only the placeholder list is interpolated.
    placeholders = ",".join("?" for _ in ids)
    sql = f"""
    SELECT nr.id,
    nt.symbol,
    nr.published_utc,
    nt.trade_date,
    nr.publisher,
    nr.source AS raw_source,
    nr.title,
    nr.summary,
    nr.article_url,
    nr.related_json,
    na.relevance,
    na.sentiment,
    na.key_discussion,
    na.summary AS analysis_summary,
    na.reason_growth,
    na.reason_decrease,
    na.ret_t0,
    na.ret_t1,
    na.ret_t3,
    na.ret_t5,
    na.ret_t10,
    na.analysis_source,
    na.raw_json AS analysis_raw_json
    FROM news_ticker nt
    JOIN news_raw nr ON nr.id = nt.news_id
    LEFT JOIN news_analysis na ON na.news_id = nt.news_id AND na.symbol = nt.symbol
    WHERE nt.symbol = ? AND nr.id IN ({placeholders})
    ORDER BY COALESCE(nr.published_utc, nt.trade_date) DESC
    """
    conn = self._connect()
    try:
        rows = conn.execute(sql, [symbol, *ids]).fetchall()
    finally:
        # Fix: close the read connection (context manager never closes).
        conn.close()
    return [self._normalize_enriched_news_row(row) for row in rows]
def get_news_categories_enriched(
    self,
    symbol: str,
    *,
    start_date: str | None = None,
    end_date: str | None = None,
    limit: int = 200,
) -> dict[str, dict[str, Any]]:
    """Bucket recent enriched news into coarse topic categories.

    Classification is a keyword-substring heuristic over the headline,
    summaries and analysis text; one article may land in several
    categories, and articles matching no keyword land in none.  Each
    bucket carries a count plus article-id lists split by sentiment.
    """
    rows = self.get_news_items_enriched(
        symbol,
        start_date=start_date,
        end_date=end_date,
        limit=limit,
    )
    categories: dict[str, dict[str, Any]] = {}
    # Topic -> trigger keywords.  Matching is plain substring search on
    # lowercased text, so very short tokens ("vs", "ai", "sec") can
    # over-match inside longer words — NOTE(review): accepted noise today;
    # revisit with word-boundary matching if precision matters.
    keyword_map = {
        "market": [
            "market", "stock", "rally", "sell-off", "selloff", "trading",
            "wall street", "s&p", "nasdaq", "dow", "index", "bull", "bear",
            "correction", "volatility",
        ],
        "policy": [
            "regulation", "fed", "federal reserve", "tariff", "sanction",
            "interest rate", "policy", "government", "congress", "sec",
            "trade war", "ban", "legislation", "tax",
        ],
        "earnings": [
            "earnings", "revenue", "profit", "quarter", "eps", "guidance",
            "forecast", "income", "sales", "beat", "miss", "outlook",
            "financial results",
        ],
        "product_tech": [
            "product", "ai", "chip", "cloud", "launch", "patent",
            "technology", "innovation", "release", "platform", "model",
            "software", "hardware", "gpu", "autonomous",
        ],
        "competition": [
            "competitor", "rival", "market share", "overtake", "compete",
            "competition", "vs", "versus", "battle", "challenge",
        ],
        "management": [
            "ceo", "executive", "resign", "layoff", "restructure",
            "management", "leadership", "appoint", "hire", "board",
            "chairman",
        ],
    }
    # Pre-seed every bucket so callers always see all categories.
    for key in keyword_map:
        categories[key] = {
            "label": key,
            "count": 0,
            "article_ids": [],
            "positive_ids": [],
            "negative_ids": [],
            "neutral_ids": [],
        }
    for row in rows:
        # Search surface: headline, summaries and the analysis text fields.
        text = " ".join(
            str(row.get(field) or "")
            for field in (
                "title",
                "summary",
                "key_discussion",
                "reason_growth",
                "reason_decrease",
            )
        ).lower()
        sentiment = str(row.get("sentiment") or "").strip().lower()
        for category, keywords in keyword_map.items():
            if not any(keyword in text for keyword in keywords):
                continue
            bucket = categories[category]
            bucket["count"] += 1
            bucket["article_ids"].append(row["id"])
            # Missing/unknown sentiment counts as neutral.
            if sentiment == "positive":
                bucket["positive_ids"].append(row["id"])
            elif sentiment == "negative":
                bucket["negative_ids"].append(row["id"])
            else:
                bucket["neutral_ids"].append(row["id"])
    return categories
def get_story_cache(
self,
symbol: str,
*,
as_of_date: str,
) -> dict[str, Any] | None:
with self._connect() as conn:
row = conn.execute(
"""
SELECT symbol, as_of_date, content, source, created_at, updated_at
FROM story_cache
WHERE symbol = ? AND as_of_date = ?
""",
(symbol, as_of_date),
).fetchone()
return dict(row) if row else None
def upsert_story_cache(
    self,
    symbol: str,
    *,
    as_of_date: str,
    content: str,
    source: str = "local",
) -> None:
    """Insert or overwrite the cached story text for (symbol, as_of_date)."""
    timestamp = _utc_timestamp()
    conn = self._connect()
    try:
        with conn:  # commit on success, rollback on error
            conn.execute(
                """
                INSERT INTO story_cache
                (symbol, as_of_date, content, source, created_at, updated_at)
                VALUES (?, ?, ?, ?, ?, ?)
                ON CONFLICT(symbol, as_of_date) DO UPDATE SET
                content = excluded.content,
                source = excluded.source,
                updated_at = excluded.updated_at
                """,
                (symbol, as_of_date, content, source, timestamp, timestamp),
            )
    finally:
        # Fix: sqlite3's context manager never closes the connection.
        conn.close()
def delete_story_cache(
self,
symbol: str,
*,
as_of_date: str | None = None,
) -> int:
with self._connect() as conn:
if as_of_date:
result = conn.execute(
"""
DELETE FROM story_cache
WHERE symbol = ? AND as_of_date = ?
""",
(symbol, as_of_date),
)
else:
result = conn.execute(
"""
DELETE FROM story_cache
WHERE symbol = ?
""",
(symbol,),
)
return int(result.rowcount or 0)
def get_similar_day_cache(
    self,
    symbol: str,
    *,
    target_date: str,
) -> dict[str, Any] | None:
    """Return the cached similar-day payload for (symbol, target_date).

    The stored JSON payload is decoded; None is returned on a cache miss.
    """
    conn = self._connect()
    try:
        row = conn.execute(
            """
            SELECT symbol, target_date, payload_json, source, created_at, updated_at
            FROM similar_day_cache
            WHERE symbol = ? AND target_date = ?
            """,
            (symbol, target_date),
        ).fetchone()
    finally:
        # Fix: close the read connection (context manager never closes).
        conn.close()
    if not row:
        return None
    return {
        "symbol": row["symbol"],
        "target_date": row["target_date"],
        "payload": _json_loads(row["payload_json"]),
        "source": row["source"],
        "created_at": row["created_at"],
        "updated_at": row["updated_at"],
    }
def upsert_similar_day_cache(
    self,
    symbol: str,
    *,
    target_date: str,
    payload: dict[str, Any],
    source: str = "local",
) -> None:
    """Insert or overwrite the cached similar-day payload (stored as JSON)."""
    timestamp = _utc_timestamp()
    conn = self._connect()
    try:
        with conn:  # commit on success, rollback on error
            conn.execute(
                """
                INSERT INTO similar_day_cache
                (symbol, target_date, payload_json, source, created_at, updated_at)
                VALUES (?, ?, ?, ?, ?, ?)
                ON CONFLICT(symbol, target_date) DO UPDATE SET
                payload_json = excluded.payload_json,
                source = excluded.source,
                updated_at = excluded.updated_at
                """,
                (symbol, target_date, _json_dumps(payload), source, timestamp, timestamp),
            )
    finally:
        # Fix: sqlite3's context manager never closes the connection.
        conn.close()
def delete_similar_day_cache(
self,
symbol: str,
*,
target_date: str | None = None,
) -> int:
with self._connect() as conn:
if target_date:
result = conn.execute(
"""
DELETE FROM similar_day_cache
WHERE symbol = ? AND target_date = ?
""",
(symbol, target_date),
)
else:
result = conn.execute(
"""
DELETE FROM similar_day_cache
WHERE symbol = ?
""",
(symbol,),
)
return int(result.rowcount or 0)
def get_enrich_report(
self,
symbols: list[str] | None = None,
*,
start_date: str | None = None,
end_date: str | None = None,
) -> list[dict[str, Any]]:
"""Summarize explain enrichment coverage and freshness per ticker."""
sql = """
SELECT nt.symbol AS symbol,
COUNT(DISTINCT nt.news_id) AS raw_news_count,
COUNT(DISTINCT na.news_id) AS analyzed_news_count,
SUM(CASE WHEN LOWER(COALESCE(na.analysis_source, '')) = 'llm' THEN 1 ELSE 0 END) AS llm_count,
SUM(CASE WHEN LOWER(COALESCE(na.analysis_source, '')) = 'local' THEN 1 ELSE 0 END) AS local_count,
MAX(na.updated_at) AS latest_analysis_at,
MAX(nt.trade_date) AS latest_trade_date
FROM news_ticker nt
LEFT JOIN news_analysis na ON na.news_id = nt.news_id AND na.symbol = nt.symbol
LEFT JOIN news_raw nr ON nr.id = nt.news_id
WHERE 1 = 1
"""
params: list[Any] = []
if symbols:
normalized = [str(symbol).strip().upper() for symbol in symbols if str(symbol).strip()]
if normalized:
placeholders = ",".join("?" for _ in normalized)
sql += f" AND nt.symbol IN ({placeholders})"
params.extend(normalized)
if start_date:
sql += " AND COALESCE(nt.trade_date, na.trade_date, substr(nr.published_utc, 1, 10)) >= ?"
params.append(start_date)
if end_date:
sql += " AND COALESCE(nt.trade_date, na.trade_date, substr(nr.published_utc, 1, 10)) <= ?"
params.append(end_date)
sql += " GROUP BY nt.symbol ORDER BY nt.symbol ASC"
with self._connect() as conn:
rows = conn.execute(sql, params).fetchall()
report: list[dict[str, Any]] = []
for row in rows:
raw_news_count = int(row["raw_news_count"] or 0)
analyzed_news_count = int(row["analyzed_news_count"] or 0)
coverage_pct = (
round((analyzed_news_count / raw_news_count) * 100, 1)
if raw_news_count > 0
else 0.0
)
report.append(
{
"symbol": row["symbol"],
"raw_news_count": raw_news_count,
"analyzed_news_count": analyzed_news_count,
"coverage_pct": coverage_pct,
"llm_count": int(row["llm_count"] or 0),
"local_count": int(row["local_count"] or 0),
"latest_analysis_at": row["latest_analysis_at"],
"latest_trade_date": row["latest_trade_date"],
}
)
return report