# -*- coding: utf-8 -*-
"""Long-lived Polygon-backed market research storage."""

from __future__ import annotations

import hashlib
import json
import os
import sqlite3
from contextlib import contextmanager
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterable, Iterator, Optional

SCHEMA = """
CREATE TABLE IF NOT EXISTS tickers (
    symbol TEXT PRIMARY KEY,
    name TEXT,
    sector TEXT,
    is_active INTEGER DEFAULT 1,
    last_price_fetch TEXT,
    last_news_fetch TEXT,
    created_at TEXT NOT NULL,
    updated_at TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS ohlc (
    symbol TEXT NOT NULL,
    date TEXT NOT NULL,
    open REAL,
    high REAL,
    low REAL,
    close REAL,
    volume REAL,
    vwap REAL,
    transactions INTEGER,
    source TEXT,
    created_at TEXT NOT NULL,
    updated_at TEXT NOT NULL,
    PRIMARY KEY (symbol, date)
);
CREATE INDEX IF NOT EXISTS idx_ohlc_symbol_date ON ohlc(symbol, date DESC);

CREATE TABLE IF NOT EXISTS news_raw (
    id TEXT PRIMARY KEY,
    published_utc TEXT,
    title TEXT,
    summary TEXT,
    publisher TEXT,
    author TEXT,
    article_url TEXT,
    amp_url TEXT,
    source TEXT,
    related_json TEXT,
    insights_json TEXT,
    raw_json TEXT NOT NULL,
    created_at TEXT NOT NULL,
    updated_at TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_news_raw_published ON news_raw(published_utc DESC);

CREATE TABLE IF NOT EXISTS news_ticker (
    news_id TEXT NOT NULL,
    symbol TEXT NOT NULL,
    trade_date TEXT,
    PRIMARY KEY (news_id, symbol)
);
CREATE INDEX IF NOT EXISTS idx_news_ticker_symbol_date ON news_ticker(symbol, trade_date DESC);

CREATE TABLE IF NOT EXISTS news_analysis (
    news_id TEXT NOT NULL,
    symbol TEXT NOT NULL,
    trade_date TEXT,
    relevance TEXT,
    sentiment TEXT,
    key_discussion TEXT,
    summary TEXT,
    reason_growth TEXT,
    reason_decrease TEXT,
    ret_t0 REAL,
    ret_t1 REAL,
    ret_t3 REAL,
    ret_t5 REAL,
    ret_t10 REAL,
    analysis_source TEXT,
    raw_json TEXT,
    created_at TEXT NOT NULL,
    updated_at TEXT NOT NULL,
    PRIMARY KEY (news_id, symbol),
    FOREIGN KEY (news_id) REFERENCES news_raw(id) ON DELETE CASCADE
);
CREATE INDEX IF NOT EXISTS idx_news_analysis_symbol_date ON news_analysis(symbol, trade_date DESC);
CREATE INDEX IF NOT EXISTS idx_news_analysis_symbol_sentiment ON news_analysis(symbol, sentiment, trade_date DESC);
CREATE INDEX IF NOT EXISTS idx_news_analysis_symbol_relevance ON news_analysis(symbol, relevance, trade_date DESC);

CREATE TABLE IF NOT EXISTS story_cache (
    symbol TEXT NOT NULL,
    as_of_date TEXT NOT NULL,
    content TEXT NOT NULL,
    source TEXT,
    created_at TEXT NOT NULL,
    updated_at TEXT NOT NULL,
    PRIMARY KEY (symbol, as_of_date)
);

CREATE TABLE IF NOT EXISTS similar_day_cache (
    symbol TEXT NOT NULL,
    target_date TEXT NOT NULL,
    payload_json TEXT NOT NULL,
    source TEXT,
    created_at TEXT NOT NULL,
    updated_at TEXT NOT NULL,
    PRIMARY KEY (symbol, target_date)
);
"""

# Effective-date expressions used to filter and group news rows. The column
# aliases (nt = news_ticker, na = news_analysis, nr = news_raw) must match the
# aliases used by the queries that embed these expressions.
_NEWS_DATE_EXPR = "COALESCE(nt.trade_date, substr(nr.published_utc, 1, 10))"
_ENRICHED_DATE_EXPR = "COALESCE(nt.trade_date, na.trade_date, substr(nr.published_utc, 1, 10))"
_ANALYSIS_DATE_EXPR = "COALESCE(na.trade_date, nt.trade_date, substr(nr.published_utc, 1, 10))"

# Shared SELECT ... FROM ... WHERE prefixes; each expects the ticker symbol as
# the first bound parameter.
_NEWS_SELECT = """
    SELECT
        nr.id,
        nt.symbol,
        nr.published_utc,
        nt.trade_date,
        nr.publisher,
        nr.title,
        nr.summary,
        nr.article_url,
        nr.related_json
    FROM news_ticker nt
    JOIN news_raw nr ON nr.id = nt.news_id
    WHERE nt.symbol = ?
"""

_ENRICHED_NEWS_SELECT = """
    SELECT
        nr.id,
        nt.symbol,
        nr.published_utc,
        nt.trade_date,
        nr.publisher,
        nr.source AS raw_source,
        nr.title,
        nr.summary,
        nr.article_url,
        nr.related_json,
        na.relevance,
        na.sentiment,
        na.key_discussion,
        na.summary AS analysis_summary,
        na.reason_growth,
        na.reason_decrease,
        na.ret_t0,
        na.ret_t1,
        na.ret_t3,
        na.ret_t5,
        na.ret_t10,
        na.analysis_source,
        na.raw_json AS analysis_raw_json
    FROM news_ticker nt
    JOIN news_raw nr ON nr.id = nt.news_id
    LEFT JOIN news_analysis na ON na.news_id = nt.news_id AND na.symbol = nt.symbol
    WHERE nt.symbol = ?
"""

_NEWS_ORDER_BY = " ORDER BY COALESCE(nr.published_utc, nt.trade_date) DESC"

# Keyword lists used by MarketStore.get_news_categories_enriched.
# NOTE(review): matching is plain substring search, so short keywords such as
# "ai", "sec", "ban" or "vs" also match inside longer words ("said", "second",
# "bank"). Kept as-is because callers may rely on the current bucketing.
_CATEGORY_KEYWORDS: dict[str, list[str]] = {
    "market": [
        "market", "stock", "rally", "sell-off", "selloff", "trading",
        "wall street", "s&p", "nasdaq", "dow", "index", "bull", "bear",
        "correction", "volatility",
    ],
    "policy": [
        "regulation", "fed", "federal reserve", "tariff", "sanction",
        "interest rate", "policy", "government", "congress", "sec",
        "trade war", "ban", "legislation", "tax",
    ],
    "earnings": [
        "earnings", "revenue", "profit", "quarter", "eps", "guidance",
        "forecast", "income", "sales", "beat", "miss", "outlook",
        "financial results",
    ],
    "product_tech": [
        "product", "ai", "chip", "cloud", "launch", "patent", "technology",
        "innovation", "release", "platform", "model", "software", "hardware",
        "gpu", "autonomous",
    ],
    "competition": [
        "competitor", "rival", "market share", "overtake", "compete",
        "competition", "vs", "versus", "battle", "challenge",
    ],
    "management": [
        "ceo", "executive", "resign", "layoff", "restructure", "management",
        "leadership", "appoint", "hire", "board", "chairman",
    ],
}


def get_market_db_path() -> Path:
    """Resolve the long-lived market database path.

    The ``MARKET_DB_PATH`` environment variable wins when it is set to a
    non-blank value; otherwise the path is ``data/market_research.db`` under
    the directory two levels above this file's directory.
    """
    raw = os.getenv("MARKET_DB_PATH", "").strip()
    if raw:
        return Path(raw).expanduser()
    return Path(__file__).resolve().parents[2] / "data" / "market_research.db"


def _json_dumps(value: Any) -> str:
    """Serialize *value* to JSON with sorted keys; unknown types become ``str``."""
    return json.dumps(value, ensure_ascii=False, sort_keys=True, default=str)


def _json_loads(value: str | None) -> Any:
    """Decode a stored JSON string.

    Returns ``None`` for empty input and the raw string itself when it is not
    valid JSON, so reading a malformed row never raises.
    """
    if not value:
        return None
    try:
        return json.loads(value)
    except json.JSONDecodeError:
        return value


def _hash_news_id(symbol: str, article: dict[str, Any], fallback_index: int) -> str:
    """Return a primary key for *article*.

    The article's own ``id`` is used verbatim when present. Otherwise the key
    is ``polygon:<symbol>:<sha1>`` where the digest is taken over the article
    URL, then the title, then ``<symbol>-<fallback_index>`` as a last resort.
    """
    article_id = article.get("id")
    if article_id:
        return article_id
    base = article.get("article_url") or article.get("title") or f"{symbol}-{fallback_index}"
    digest = hashlib.sha1(str(base).encode("utf-8")).hexdigest()
    return f"polygon:{symbol}:{digest}"


def _publisher_name(publisher: Any) -> Any:
    """Return a value sqlite3 can bind for an article's ``publisher`` field.

    A mapping is reduced to its ``name`` entry, because sqlite3 cannot bind a
    dict as a parameter; any other value is passed through unchanged.
    """
    if isinstance(publisher, dict):
        return publisher.get("name")
    return publisher


def _utc_timestamp() -> str:
    """Return the current UTC time as an ISO-8601 string with second precision."""
    return datetime.now(timezone.utc).isoformat(timespec="seconds")


class MarketStore:
    """SQLite-backed market research warehouse.

    Use get_instance() for the singleton.
    """

    _instance: Optional["MarketStore"] = None

    def __new__(cls, db_path: Path | None = None) -> "MarketStore":
        """Return the shared instance, replacing it when a different path is requested."""
        existing = cls._instance
        if existing is not None:
            # getattr: an earlier construction may have failed before db_path
            # was assigned, leaving a half-built singleton behind.
            current_path = getattr(existing, "db_path", None)
            if db_path is None or current_path == Path(db_path or get_market_db_path()):
                return existing
        instance = super().__new__(cls)
        cls._instance = instance
        return instance

    def __init__(self, db_path: Path | None = None):
        """Create the database directory and schema on first construction only."""
        if getattr(self, "_initialized", False):
            return
        self.db_path = Path(db_path or get_market_db_path())
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._init_db()
        self._initialized = True

    @classmethod
    def get_instance(cls, db_path: Path | None = None) -> "MarketStore":
        """Get the MarketStore singleton instance."""
        return cls(db_path)

    def _connect(self) -> sqlite3.Connection:
        """Open a new connection with row access by name, WAL and foreign keys on.

        The caller owns the returned connection and must close it; internal
        code should go through :meth:`_session` instead.
        """
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA foreign_keys=ON")
        return conn

    @contextmanager
    def _session(self) -> Iterator[sqlite3.Connection]:
        """Yield a connection that is committed or rolled back, then closed.

        ``with connection:`` on its own only ends the transaction; it leaves
        the connection open, so the explicit ``close()`` here is what releases
        the handle after each operation.
        """
        conn = self._connect()
        try:
            with conn:
                yield conn
        finally:
            conn.close()

    def _init_db(self):
        """Create all tables and indexes if they do not exist yet."""
        with self._session() as conn:
            conn.executescript(SCHEMA)

    @staticmethod
    def _date_window_clause(
        date_expr: str,
        start_date: str | None,
        end_date: str | None,
    ) -> tuple[str, list[Any]]:
        """Build the optional inclusive ``>= start`` / ``<= end`` SQL filter.

        Returns the SQL fragment (possibly empty) and its bound parameters.
        *date_expr* must be a trusted SQL expression, never user input.
        """
        clause = ""
        params: list[Any] = []
        if start_date:
            clause += f" AND {date_expr} >= ?"
            params.append(start_date)
        if end_date:
            clause += f" AND {date_expr} <= ?"
            params.append(end_date)
        return clause, params

    # ------------------------------------------------------------------
    # Tickers
    # ------------------------------------------------------------------

    def upsert_ticker(
        self,
        *,
        symbol: str,
        name: str | None = None,
        sector: str | None = None,
        is_active: bool = True,
    ) -> int:
        """Insert or update one ticker; ``None`` name/sector keep the stored value.

        Returns the number of rows written (always 1).
        """
        timestamp = _utc_timestamp()
        with self._session() as conn:
            conn.execute(
                """
                INSERT INTO tickers (symbol, name, sector, is_active, created_at, updated_at)
                VALUES (?, ?, ?, ?, ?, ?)
                ON CONFLICT(symbol) DO UPDATE SET
                    name = COALESCE(excluded.name, tickers.name),
                    sector = COALESCE(excluded.sector, tickers.sector),
                    is_active = excluded.is_active,
                    updated_at = excluded.updated_at
                """,
                (symbol, name, sector, 1 if is_active else 0, timestamp, timestamp),
            )
        return 1

    def update_fetch_watermark(
        self,
        *,
        symbol: str,
        price_date: str | None = None,
        news_date: str | None = None,
    ) -> int:
        """Record the last fetched price/news date for a ticker.

        A ``None`` argument leaves the corresponding stored watermark as is.
        The ticker row is created when missing. Returns 1.
        """
        timestamp = _utc_timestamp()
        with self._session() as conn:
            conn.execute(
                """
                INSERT INTO tickers (symbol, created_at, updated_at, last_price_fetch, last_news_fetch)
                VALUES (?, ?, ?, ?, ?)
                ON CONFLICT(symbol) DO UPDATE SET
                    last_price_fetch = COALESCE(excluded.last_price_fetch, tickers.last_price_fetch),
                    last_news_fetch = COALESCE(excluded.last_news_fetch, tickers.last_news_fetch),
                    updated_at = excluded.updated_at
                """,
                (symbol, timestamp, timestamp, price_date, news_date),
            )
        return 1

    def get_ticker_watermarks(self, symbol: str) -> dict[str, Any]:
        """Return the stored fetch watermarks, or ``None`` values for an unknown ticker."""
        with self._session() as conn:
            row = conn.execute(
                """
                SELECT symbol, last_price_fetch, last_news_fetch
                FROM tickers
                WHERE symbol = ?
                """,
                (symbol,),
            ).fetchone()
        if row:
            return dict(row)
        return {
            "symbol": symbol,
            "last_price_fetch": None,
            "last_news_fetch": None,
        }

    def get_latest_news_date(self, symbol: str) -> str | None:
        """Return the latest stored published news date for one ticker."""
        with self._session() as conn:
            row = conn.execute(
                """
                SELECT MAX(substr(nr.published_utc, 1, 10)) AS latest_date
                FROM news_ticker nt
                JOIN news_raw nr ON nr.id = nt.news_id
                WHERE nt.symbol = ?
                """,
                (symbol,),
            ).fetchone()
        if row and row["latest_date"]:
            return str(row["latest_date"]).strip()
        return None

    # ------------------------------------------------------------------
    # Prices
    # ------------------------------------------------------------------

    def upsert_ohlc(self, symbol: str, rows: Iterable[dict[str, Any]], *, source: str = "polygon") -> int:
        """Insert or overwrite daily bars; rows without a ``date`` are skipped.

        Returns the number of rows written.
        """
        timestamp = _utc_timestamp()
        count = 0
        with self._session() as conn:
            for row in rows:
                if not row.get("date"):
                    continue
                conn.execute(
                    """
                    INSERT INTO ohlc
                    (symbol, date, open, high, low, close, volume, vwap, transactions, source, created_at, updated_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                    ON CONFLICT(symbol, date) DO UPDATE SET
                        open = excluded.open,
                        high = excluded.high,
                        low = excluded.low,
                        close = excluded.close,
                        volume = excluded.volume,
                        vwap = excluded.vwap,
                        transactions = excluded.transactions,
                        source = excluded.source,
                        updated_at = excluded.updated_at
                    """,
                    (
                        symbol,
                        row.get("date"),
                        row.get("open"),
                        row.get("high"),
                        row.get("low"),
                        row.get("close"),
                        row.get("volume"),
                        row.get("vwap"),
                        row.get("transactions"),
                        source,
                        timestamp,
                        timestamp,
                    ),
                )
                count += 1
        return count

    def get_ohlc(self, symbol: str, start_date: str, end_date: str) -> list[dict[str, Any]]:
        """Return bars for *symbol* within the inclusive date range, oldest first."""
        with self._session() as conn:
            rows = conn.execute(
                """
                SELECT symbol, date, open, high, low, close, volume, vwap, transactions, source
                FROM ohlc
                WHERE symbol = ? AND date >= ? AND date <= ?
                ORDER BY date ASC
                """,
                (symbol, start_date, end_date),
            ).fetchall()
        return [dict(row) for row in rows]

    # ------------------------------------------------------------------
    # Raw news
    # ------------------------------------------------------------------

    def upsert_news(self, symbol: str, articles: Iterable[dict[str, Any]], *, source: str = "polygon") -> int:
        """Store raw articles and link each one to its tickers.

        Each article is upserted into ``news_raw`` and linked in
        ``news_ticker`` to every entry of its ``tickers`` list (falling back
        to *symbol*), upper-cased and stripped; blank entries are skipped.
        Existing links are left untouched so an assigned trade date survives.

        Returns the number of articles written.
        """
        timestamp = _utc_timestamp()
        count = 0
        with self._session() as conn:
            for index, article in enumerate(articles):
                news_id = _hash_news_id(symbol, article, index)
                tickers = article.get("tickers") or [symbol]
                insights = article.get("insights")
                conn.execute(
                    """
                    INSERT INTO news_raw
                    (id, published_utc, title, summary, publisher, author, article_url, amp_url, source,
                     related_json, insights_json, raw_json, created_at, updated_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                    ON CONFLICT(id) DO UPDATE SET
                        published_utc = excluded.published_utc,
                        title = excluded.title,
                        summary = excluded.summary,
                        publisher = excluded.publisher,
                        author = excluded.author,
                        article_url = excluded.article_url,
                        amp_url = excluded.amp_url,
                        source = excluded.source,
                        related_json = excluded.related_json,
                        insights_json = excluded.insights_json,
                        raw_json = excluded.raw_json,
                        updated_at = excluded.updated_at
                    """,
                    (
                        news_id,
                        article.get("published_utc"),
                        article.get("title"),
                        article.get("description") or article.get("summary"),
                        _publisher_name(article.get("publisher")),
                        article.get("author"),
                        article.get("article_url"),
                        article.get("amp_url"),
                        source,
                        _json_dumps(tickers),
                        _json_dumps(insights) if insights else None,
                        _json_dumps(article),
                        timestamp,
                        timestamp,
                    ),
                )
                count += 1
                for ticker in tickers:
                    linked_symbol = str(ticker).strip().upper()
                    if not linked_symbol:
                        # A blank entry would create a link row with an empty symbol.
                        continue
                    conn.execute(
                        """
                        INSERT OR IGNORE INTO news_ticker (news_id, symbol, trade_date)
                        VALUES (?, ?, NULL)
                        """,
                        (news_id, linked_symbol),
                    )
        return count

    def get_news_without_trade_date(self, symbol: str | None = None, *, limit: int = 5000) -> list[dict[str, Any]]:
        """Return news/ticker links with no trade date yet, oldest first.

        Optionally restricted to one *symbol*; at most ``max(1, limit)`` rows.
        """
        sql = """
            SELECT nt.news_id, nt.symbol, nr.published_utc
            FROM news_ticker nt
            JOIN news_raw nr ON nr.id = nt.news_id
            WHERE nt.trade_date IS NULL
        """
        params: list[Any] = []
        if symbol:
            sql += " AND nt.symbol = ?"
            params.append(symbol)
        sql += " ORDER BY nr.published_utc ASC LIMIT ?"
        params.append(max(1, int(limit)))
        with self._session() as conn:
            rows = conn.execute(sql, params).fetchall()
        return [dict(row) for row in rows]

    def set_trade_dates(self, rows: Iterable[dict[str, str]]) -> int:
        """Assign ``trade_date`` to news/ticker links.

        Each row needs ``trade_date``, ``news_id`` and ``symbol``. Returns the
        number of input rows processed, whether or not a link matched.
        """
        count = 0
        with self._session() as conn:
            for row in rows:
                conn.execute(
                    """
                    UPDATE news_ticker
                    SET trade_date = ?
                    WHERE news_id = ? AND symbol = ?
                    """,
                    (row["trade_date"], row["news_id"], row["symbol"]),
                )
                count += 1
        return count

    # ------------------------------------------------------------------
    # News analysis
    # ------------------------------------------------------------------

    def upsert_news_analysis(
        self,
        symbol: str,
        rows: Iterable[dict[str, Any]],
        *,
        analysis_source: str = "local",
    ) -> int:
        """Insert or overwrite per-ticker analysis rows.

        *symbol* is stripped and upper-cased; a blank symbol writes nothing.
        Rows without ``news_id`` (or ``id``) are skipped. A row's own
        ``analysis_source`` takes precedence over the keyword argument, and
        ``raw_json`` stores the row's ``raw_json`` value or the whole row.

        Returns the number of rows written.
        """
        timestamp = _utc_timestamp()
        normalized_symbol = str(symbol or "").strip().upper()
        if not normalized_symbol:
            return 0
        count = 0
        with self._session() as conn:
            for row in rows:
                news_id = str(row.get("news_id") or row.get("id") or "").strip()
                if not news_id:
                    continue
                conn.execute(
                    """
                    INSERT INTO news_analysis
                    (news_id, symbol, trade_date, relevance, sentiment, key_discussion, summary,
                     reason_growth, reason_decrease, ret_t0, ret_t1, ret_t3, ret_t5, ret_t10,
                     analysis_source, raw_json, created_at, updated_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                    ON CONFLICT(news_id, symbol) DO UPDATE SET
                        trade_date = excluded.trade_date,
                        relevance = excluded.relevance,
                        sentiment = excluded.sentiment,
                        key_discussion = excluded.key_discussion,
                        summary = excluded.summary,
                        reason_growth = excluded.reason_growth,
                        reason_decrease = excluded.reason_decrease,
                        ret_t0 = excluded.ret_t0,
                        ret_t1 = excluded.ret_t1,
                        ret_t3 = excluded.ret_t3,
                        ret_t5 = excluded.ret_t5,
                        ret_t10 = excluded.ret_t10,
                        analysis_source = excluded.analysis_source,
                        raw_json = excluded.raw_json,
                        updated_at = excluded.updated_at
                    """,
                    (
                        news_id,
                        normalized_symbol,
                        row.get("trade_date"),
                        row.get("relevance"),
                        row.get("sentiment"),
                        row.get("key_discussion"),
                        row.get("summary"),
                        row.get("reason_growth"),
                        row.get("reason_decrease"),
                        row.get("ret_t0"),
                        row.get("ret_t1"),
                        row.get("ret_t3"),
                        row.get("ret_t5"),
                        row.get("ret_t10"),
                        row.get("analysis_source") or analysis_source,
                        _json_dumps(row.get("raw_json") or row),
                        timestamp,
                        timestamp,
                    ),
                )
                count += 1
        return count

    def get_analyzed_news_ids(
        self,
        symbol: str,
        *,
        start_date: str | None = None,
        end_date: str | None = None,
    ) -> set[str]:
        """Return already analyzed news ids for a symbol and optional date window."""
        sql = """
            SELECT na.news_id
            FROM news_analysis na
            LEFT JOIN news_ticker nt ON nt.news_id = na.news_id AND nt.symbol = na.symbol
            LEFT JOIN news_raw nr ON nr.id = na.news_id
            WHERE na.symbol = ?
        """
        clause, window_params = self._date_window_clause(_ANALYSIS_DATE_EXPR, start_date, end_date)
        with self._session() as conn:
            rows = conn.execute(sql + clause, [symbol, *window_params]).fetchall()
        return {str(row["news_id"]) for row in rows if row["news_id"]}

    def get_analyzed_news_sources(
        self,
        symbol: str,
        *,
        start_date: str | None = None,
        end_date: str | None = None,
    ) -> dict[str, str]:
        """Return analyzed news ids mapped to analysis source.

        Sources are stripped and lower-cased; a missing source maps to ``""``.
        """
        sql = """
            SELECT na.news_id, na.analysis_source
            FROM news_analysis na
            LEFT JOIN news_ticker nt ON nt.news_id = na.news_id AND nt.symbol = na.symbol
            LEFT JOIN news_raw nr ON nr.id = na.news_id
            WHERE na.symbol = ?
        """
        clause, window_params = self._date_window_clause(_ANALYSIS_DATE_EXPR, start_date, end_date)
        with self._session() as conn:
            rows = conn.execute(sql + clause, [symbol, *window_params]).fetchall()
        return {
            str(row["news_id"]): str(row["analysis_source"] or "").strip().lower()
            for row in rows
            if row["news_id"]
        }

    # ------------------------------------------------------------------
    # News queries
    # ------------------------------------------------------------------

    @staticmethod
    def _normalize_news_row(row: sqlite3.Row | dict[str, Any]) -> dict[str, Any]:
        """Map a row selected with ``_NEWS_SELECT`` to the public news item shape."""
        return {
            "id": row["id"],
            "ticker": row["symbol"],
            "date": row["published_utc"] or row["trade_date"],
            "trade_date": row["trade_date"],
            "source": row["publisher"] or "polygon",
            "title": row["title"],
            "summary": row["summary"],
            "url": row["article_url"],
            "related": _json_loads(row["related_json"]),
            "category": "",
        }

    @staticmethod
    def _normalize_enriched_news_row(row: sqlite3.Row | dict[str, Any]) -> dict[str, Any]:
        """Map a joined news/analysis row to the public enriched item shape.

        Accepts a ``sqlite3.Row`` or a plain dict. ``analysis_raw_json``,
        ``related_json`` and ``category`` are optional; all other columns of
        ``_ENRICHED_NEWS_SELECT`` are required and raise ``KeyError`` if absent.
        """
        # sqlite3.Row has no .get(); a dict copy lets optional columns be read
        # uniformly for both input types.
        data = dict(row)
        analysis_meta = _json_loads(data.get("analysis_raw_json"))
        model_label = analysis_meta.get("model_label") if isinstance(analysis_meta, dict) else None
        return {
            "id": data["id"],
            "ticker": data["symbol"],
            "date": data["published_utc"] or data["trade_date"],
            "trade_date": data["trade_date"],
            "source": data["publisher"] or data["raw_source"] or "polygon",
            "title": data["title"],
            "summary": data["analysis_summary"] or data["summary"],
            "url": data["article_url"],
            "related": _json_loads(data.get("related_json")),
            "category": data["category"] if "category" in data else "",
            "relevance": data["relevance"],
            "sentiment": data["sentiment"],
            "key_discussion": data["key_discussion"],
            "reason_growth": data["reason_growth"],
            "reason_decrease": data["reason_decrease"],
            "ret_t0": data["ret_t0"],
            "ret_t1": data["ret_t1"],
            "ret_t3": data["ret_t3"],
            "ret_t5": data["ret_t5"],
            "ret_t10": data["ret_t10"],
            "analysis_source": data["analysis_source"],
            "analysis_model_label": model_label,
        }

    def get_news_items_enriched(
        self,
        symbol: str,
        *,
        start_date: str | None = None,
        end_date: str | None = None,
        trade_date: str | None = None,
        limit: int = 100,
    ) -> list[dict[str, Any]]:
        """Return news for *symbol* joined with its analysis, newest first.

        When *trade_date* is given it selects exactly that effective date and
        the start/end window is ignored. At most ``max(1, limit)`` rows.
        """
        sql = _ENRICHED_NEWS_SELECT
        params: list[Any] = [symbol]
        if trade_date:
            sql += f" AND {_ENRICHED_DATE_EXPR} = ?"
            params.append(trade_date)
        else:
            clause, window_params = self._date_window_clause(_ENRICHED_DATE_EXPR, start_date, end_date)
            sql += clause
            params.extend(window_params)
        sql += _NEWS_ORDER_BY + " LIMIT ?"
        params.append(max(1, int(limit)))
        with self._session() as conn:
            rows = conn.execute(sql, params).fetchall()
        return [self._normalize_enriched_news_row(row) for row in rows]

    def get_news_items(
        self,
        symbol: str,
        *,
        start_date: str | None = None,
        end_date: str | None = None,
        limit: int = 100,
    ) -> list[dict[str, Any]]:
        """Return raw news for *symbol* in an optional date window, newest first.

        At most ``max(1, limit)`` rows.
        """
        clause, window_params = self._date_window_clause(_NEWS_DATE_EXPR, start_date, end_date)
        sql = _NEWS_SELECT + clause + _NEWS_ORDER_BY + " LIMIT ?"
        params: list[Any] = [symbol, *window_params, max(1, int(limit))]
        with self._session() as conn:
            rows = conn.execute(sql, params).fetchall()
        return [self._normalize_news_row(row) for row in rows]

    def get_news_timeline(
        self,
        symbol: str,
        *,
        start_date: str | None = None,
        end_date: str | None = None,
    ) -> list[dict[str, Any]]:
        """Return per-day article counts for *symbol*, oldest day first.

        ``top_title`` is the lexicographically greatest title of the day
        (SQL ``MAX``), not a ranking by importance. Days with no resolvable
        date are dropped.
        """
        sql = f"""
            SELECT
                {_NEWS_DATE_EXPR} AS date,
                COUNT(*) AS count,
                COUNT(DISTINCT nr.publisher) AS source_count,
                MAX(nr.title) AS top_title
            FROM news_ticker nt
            JOIN news_raw nr ON nr.id = nt.news_id
            WHERE nt.symbol = ?
        """
        clause, window_params = self._date_window_clause(_NEWS_DATE_EXPR, start_date, end_date)
        sql += clause
        sql += f"""
            GROUP BY {_NEWS_DATE_EXPR}
            ORDER BY date ASC
        """
        with self._session() as conn:
            rows = conn.execute(sql, [symbol, *window_params]).fetchall()
        return [
            {
                "date": row["date"],
                "count": int(row["count"] or 0),
                "source_count": int(row["source_count"] or 0),
                "top_title": row["top_title"] or "",
            }
            for row in rows
            if row["date"]
        ]

    def get_news_timeline_enriched(
        self,
        symbol: str,
        *,
        start_date: str | None = None,
        end_date: str | None = None,
    ) -> list[dict[str, Any]]:
        """Return per-day counts plus sentiment/relevance tallies, oldest day first.

        Articles without analysis, or with an empty or ``neutral`` sentiment,
        count as neutral. ``high_relevance_count`` counts relevance values
        ``high`` or ``relevant`` (case-insensitive).
        """
        sql = f"""
            SELECT
                {_ENRICHED_DATE_EXPR} AS date,
                COUNT(*) AS count,
                COUNT(DISTINCT nr.publisher) AS source_count,
                MAX(nr.title) AS top_title,
                SUM(CASE WHEN LOWER(COALESCE(na.sentiment, '')) = 'positive' THEN 1 ELSE 0 END) AS positive_count,
                SUM(CASE WHEN LOWER(COALESCE(na.sentiment, '')) = 'negative' THEN 1 ELSE 0 END) AS negative_count,
                SUM(CASE WHEN LOWER(COALESCE(na.sentiment, '')) IN ('neutral', '') OR na.sentiment IS NULL THEN 1 ELSE 0 END) AS neutral_count,
                SUM(CASE WHEN LOWER(COALESCE(na.relevance, '')) IN ('high', 'relevant') THEN 1 ELSE 0 END) AS high_relevance_count
            FROM news_ticker nt
            JOIN news_raw nr ON nr.id = nt.news_id
            LEFT JOIN news_analysis na ON na.news_id = nt.news_id AND na.symbol = nt.symbol
            WHERE nt.symbol = ?
        """
        clause, window_params = self._date_window_clause(_ENRICHED_DATE_EXPR, start_date, end_date)
        sql += clause
        sql += f"""
            GROUP BY {_ENRICHED_DATE_EXPR}
            ORDER BY date ASC
        """
        with self._session() as conn:
            rows = conn.execute(sql, [symbol, *window_params]).fetchall()
        return [
            {
                "date": row["date"],
                "count": int(row["count"] or 0),
                "source_count": int(row["source_count"] or 0),
                "top_title": row["top_title"] or "",
                "positive_count": int(row["positive_count"] or 0),
                "negative_count": int(row["negative_count"] or 0),
                "neutral_count": int(row["neutral_count"] or 0),
                "high_relevance_count": int(row["high_relevance_count"] or 0),
            }
            for row in rows
            if row["date"]
        ]

    def get_news_by_ids(self, symbol: str, article_ids: Iterable[str]) -> list[dict[str, Any]]:
        """Return raw news for *symbol* restricted to *article_ids*, newest first.

        Blank ids are ignored; an empty id list returns ``[]`` without a query.
        """
        ids = [str(item).strip() for item in article_ids if str(item).strip()]
        if not ids:
            return []
        placeholders = ",".join("?" for _ in ids)
        sql = _NEWS_SELECT + f" AND nr.id IN ({placeholders})" + _NEWS_ORDER_BY
        with self._session() as conn:
            rows = conn.execute(sql, [symbol, *ids]).fetchall()
        return [self._normalize_news_row(row) for row in rows]

    def get_news_by_ids_enriched(
        self,
        symbol: str,
        article_ids: Iterable[str],
    ) -> list[dict[str, Any]]:
        """Return analysis-enriched news for *symbol* restricted to *article_ids*.

        Blank ids are ignored; an empty id list returns ``[]`` without a query.
        """
        ids = [str(item).strip() for item in article_ids if str(item).strip()]
        if not ids:
            return []
        placeholders = ",".join("?" for _ in ids)
        sql = _ENRICHED_NEWS_SELECT + f" AND nr.id IN ({placeholders})" + _NEWS_ORDER_BY
        with self._session() as conn:
            rows = conn.execute(sql, [symbol, *ids]).fetchall()
        return [self._normalize_enriched_news_row(row) for row in rows]

    def get_news_categories_enriched(
        self,
        symbol: str,
        *,
        start_date: str | None = None,
        end_date: str | None = None,
        limit: int = 200,
    ) -> dict[str, dict[str, Any]]:
        """Bucket enriched news into keyword categories.

        An article lands in every category with at least one keyword occurring
        as a substring of its lower-cased title, summary and analysis text.
        Each bucket carries the total count, all article ids, and the ids split
        by sentiment (anything other than positive/negative is neutral).
        """
        rows = self.get_news_items_enriched(
            symbol,
            start_date=start_date,
            end_date=end_date,
            limit=limit,
        )
        categories: dict[str, dict[str, Any]] = {
            key: {
                "label": key,
                "count": 0,
                "article_ids": [],
                "positive_ids": [],
                "negative_ids": [],
                "neutral_ids": [],
            }
            for key in _CATEGORY_KEYWORDS
        }
        text_fields = ("title", "summary", "key_discussion", "reason_growth", "reason_decrease")
        for row in rows:
            text = " ".join(str(row.get(field) or "") for field in text_fields).lower()
            sentiment = str(row.get("sentiment") or "").strip().lower()
            if sentiment == "positive":
                sentiment_key = "positive_ids"
            elif sentiment == "negative":
                sentiment_key = "negative_ids"
            else:
                sentiment_key = "neutral_ids"
            for category, keywords in _CATEGORY_KEYWORDS.items():
                if not any(keyword in text for keyword in keywords):
                    continue
                bucket = categories[category]
                bucket["count"] += 1
                bucket["article_ids"].append(row["id"])
                bucket[sentiment_key].append(row["id"])
        return categories

    # ------------------------------------------------------------------
    # Story cache
    # ------------------------------------------------------------------

    def get_story_cache(
        self,
        symbol: str,
        *,
        as_of_date: str,
    ) -> dict[str, Any] | None:
        """Return the cached story for (*symbol*, *as_of_date*), or ``None``."""
        with self._session() as conn:
            row = conn.execute(
                """
                SELECT symbol, as_of_date, content, source, created_at, updated_at
                FROM story_cache
                WHERE symbol = ? AND as_of_date = ?
                """,
                (symbol, as_of_date),
            ).fetchone()
        return dict(row) if row else None

    def upsert_story_cache(
        self,
        symbol: str,
        *,
        as_of_date: str,
        content: str,
        source: str = "local",
    ) -> int:
        """Insert or overwrite the cached story for (*symbol*, *as_of_date*). Returns 1."""
        timestamp = _utc_timestamp()
        with self._session() as conn:
            conn.execute(
                """
                INSERT INTO story_cache (symbol, as_of_date, content, source, created_at, updated_at)
                VALUES (?, ?, ?, ?, ?, ?)
                ON CONFLICT(symbol, as_of_date) DO UPDATE SET
                    content = excluded.content,
                    source = excluded.source,
                    updated_at = excluded.updated_at
                """,
                (symbol, as_of_date, content, source, timestamp, timestamp),
            )
        return 1

    def delete_story_cache(
        self,
        symbol: str,
        *,
        as_of_date: str | None = None,
    ) -> int:
        """Delete one cached story, or all of a symbol's when *as_of_date* is omitted.

        Returns the number of rows deleted.
        """
        with self._session() as conn:
            if as_of_date:
                cursor = conn.execute(
                    """
                    DELETE FROM story_cache
                    WHERE symbol = ? AND as_of_date = ?
                    """,
                    (symbol, as_of_date),
                )
            else:
                cursor = conn.execute(
                    """
                    DELETE FROM story_cache
                    WHERE symbol = ?
                    """,
                    (symbol,),
                )
            # Read while the connection is still open.
            deleted = int(cursor.rowcount or 0)
        return deleted

    # ------------------------------------------------------------------
    # Similar-day cache
    # ------------------------------------------------------------------

    def get_similar_day_cache(
        self,
        symbol: str,
        *,
        target_date: str,
    ) -> dict[str, Any] | None:
        """Return the cached similar-day payload (JSON-decoded), or ``None``."""
        with self._session() as conn:
            row = conn.execute(
                """
                SELECT symbol, target_date, payload_json, source, created_at, updated_at
                FROM similar_day_cache
                WHERE symbol = ? AND target_date = ?
                """,
                (symbol, target_date),
            ).fetchone()
        if not row:
            return None
        return {
            "symbol": row["symbol"],
            "target_date": row["target_date"],
            "payload": _json_loads(row["payload_json"]),
            "source": row["source"],
            "created_at": row["created_at"],
            "updated_at": row["updated_at"],
        }

    def upsert_similar_day_cache(
        self,
        symbol: str,
        *,
        target_date: str,
        payload: dict[str, Any],
        source: str = "local",
    ) -> int:
        """Insert or overwrite the similar-day payload for (*symbol*, *target_date*). Returns 1."""
        timestamp = _utc_timestamp()
        with self._session() as conn:
            conn.execute(
                """
                INSERT INTO similar_day_cache (symbol, target_date, payload_json, source, created_at, updated_at)
                VALUES (?, ?, ?, ?, ?, ?)
                ON CONFLICT(symbol, target_date) DO UPDATE SET
                    payload_json = excluded.payload_json,
                    source = excluded.source,
                    updated_at = excluded.updated_at
                """,
                (symbol, target_date, _json_dumps(payload), source, timestamp, timestamp),
            )
        return 1

    def delete_similar_day_cache(
        self,
        symbol: str,
        *,
        target_date: str | None = None,
    ) -> int:
        """Delete one cached payload, or all of a symbol's when *target_date* is omitted.

        Returns the number of rows deleted.
        """
        with self._session() as conn:
            if target_date:
                cursor = conn.execute(
                    """
                    DELETE FROM similar_day_cache
                    WHERE symbol = ? AND target_date = ?
                    """,
                    (symbol, target_date),
                )
            else:
                cursor = conn.execute(
                    """
                    DELETE FROM similar_day_cache
                    WHERE symbol = ?
                    """,
                    (symbol,),
                )
            # Read while the connection is still open.
            deleted = int(cursor.rowcount or 0)
        return deleted

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------

    def get_enrich_report(
        self,
        symbols: list[str] | None = None,
        *,
        start_date: str | None = None,
        end_date: str | None = None,
    ) -> list[dict[str, Any]]:
        """Summarize explain enrichment coverage and freshness per ticker.

        *symbols* are stripped and upper-cased before filtering; an empty or
        missing list reports on every ticker. ``coverage_pct`` is analyzed /
        raw article count as a percentage rounded to one decimal.
        """
        sql = """
            SELECT
                nt.symbol AS symbol,
                COUNT(DISTINCT nt.news_id) AS raw_news_count,
                COUNT(DISTINCT na.news_id) AS analyzed_news_count,
                SUM(CASE WHEN LOWER(COALESCE(na.analysis_source, '')) = 'llm' THEN 1 ELSE 0 END) AS llm_count,
                SUM(CASE WHEN LOWER(COALESCE(na.analysis_source, '')) = 'local' THEN 1 ELSE 0 END) AS local_count,
                MAX(na.updated_at) AS latest_analysis_at,
                MAX(nt.trade_date) AS latest_trade_date
            FROM news_ticker nt
            LEFT JOIN news_analysis na ON na.news_id = nt.news_id AND na.symbol = nt.symbol
            LEFT JOIN news_raw nr ON nr.id = nt.news_id
            WHERE 1 = 1
        """
        params: list[Any] = []
        if symbols:
            normalized = [str(symbol).strip().upper() for symbol in symbols if str(symbol).strip()]
            if normalized:
                placeholders = ",".join("?" for _ in normalized)
                sql += f" AND nt.symbol IN ({placeholders})"
                params.extend(normalized)
        clause, window_params = self._date_window_clause(_ENRICHED_DATE_EXPR, start_date, end_date)
        sql += clause
        params.extend(window_params)
        sql += " GROUP BY nt.symbol ORDER BY nt.symbol ASC"
        with self._session() as conn:
            rows = conn.execute(sql, params).fetchall()

        report: list[dict[str, Any]] = []
        for row in rows:
            raw_news_count = int(row["raw_news_count"] or 0)
            analyzed_news_count = int(row["analyzed_news_count"] or 0)
            coverage_pct = (
                round((analyzed_news_count / raw_news_count) * 100, 1)
                if raw_news_count > 0
                else 0.0
            )
            report.append(
                {
                    "symbol": row["symbol"],
                    "raw_news_count": raw_news_count,
                    "analyzed_news_count": analyzed_news_count,
                    "coverage_pct": coverage_pct,
                    "llm_count": int(row["llm_count"] or 0),
                    "local_count": int(row["local_count"] or 0),
                    "latest_analysis_at": row["latest_analysis_at"],
                    "latest_trade_date": row["latest_trade_date"],
                }
            )
        return report