# (pasted-source metadata removed)
# -*- coding: utf-8 -*-
|
|
"""Long-lived Polygon-backed market research storage."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import hashlib
import json
import os
import re
import sqlite3
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterable, Optional
|
|
|
|
|
|
SCHEMA = """
|
|
CREATE TABLE IF NOT EXISTS tickers (
|
|
symbol TEXT PRIMARY KEY,
|
|
name TEXT,
|
|
sector TEXT,
|
|
is_active INTEGER DEFAULT 1,
|
|
last_price_fetch TEXT,
|
|
last_news_fetch TEXT,
|
|
created_at TEXT NOT NULL,
|
|
updated_at TEXT NOT NULL
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS ohlc (
|
|
symbol TEXT NOT NULL,
|
|
date TEXT NOT NULL,
|
|
open REAL,
|
|
high REAL,
|
|
low REAL,
|
|
close REAL,
|
|
volume REAL,
|
|
vwap REAL,
|
|
transactions INTEGER,
|
|
source TEXT,
|
|
created_at TEXT NOT NULL,
|
|
updated_at TEXT NOT NULL,
|
|
PRIMARY KEY (symbol, date)
|
|
);
|
|
CREATE INDEX IF NOT EXISTS idx_ohlc_symbol_date ON ohlc(symbol, date DESC);
|
|
|
|
CREATE TABLE IF NOT EXISTS news_raw (
|
|
id TEXT PRIMARY KEY,
|
|
published_utc TEXT,
|
|
title TEXT,
|
|
summary TEXT,
|
|
publisher TEXT,
|
|
author TEXT,
|
|
article_url TEXT,
|
|
amp_url TEXT,
|
|
source TEXT,
|
|
related_json TEXT,
|
|
insights_json TEXT,
|
|
raw_json TEXT NOT NULL,
|
|
created_at TEXT NOT NULL,
|
|
updated_at TEXT NOT NULL
|
|
);
|
|
CREATE INDEX IF NOT EXISTS idx_news_raw_published ON news_raw(published_utc DESC);
|
|
|
|
CREATE TABLE IF NOT EXISTS news_ticker (
|
|
news_id TEXT NOT NULL,
|
|
symbol TEXT NOT NULL,
|
|
trade_date TEXT,
|
|
PRIMARY KEY (news_id, symbol)
|
|
);
|
|
CREATE INDEX IF NOT EXISTS idx_news_ticker_symbol_date ON news_ticker(symbol, trade_date DESC);
|
|
|
|
CREATE TABLE IF NOT EXISTS news_analysis (
|
|
news_id TEXT NOT NULL,
|
|
symbol TEXT NOT NULL,
|
|
trade_date TEXT,
|
|
relevance TEXT,
|
|
sentiment TEXT,
|
|
key_discussion TEXT,
|
|
summary TEXT,
|
|
reason_growth TEXT,
|
|
reason_decrease TEXT,
|
|
ret_t0 REAL,
|
|
ret_t1 REAL,
|
|
ret_t3 REAL,
|
|
ret_t5 REAL,
|
|
ret_t10 REAL,
|
|
analysis_source TEXT,
|
|
raw_json TEXT,
|
|
created_at TEXT NOT NULL,
|
|
updated_at TEXT NOT NULL,
|
|
PRIMARY KEY (news_id, symbol),
|
|
FOREIGN KEY (news_id) REFERENCES news_raw(id) ON DELETE CASCADE
|
|
);
|
|
CREATE INDEX IF NOT EXISTS idx_news_analysis_symbol_date ON news_analysis(symbol, trade_date DESC);
|
|
CREATE INDEX IF NOT EXISTS idx_news_analysis_symbol_sentiment ON news_analysis(symbol, sentiment, trade_date DESC);
|
|
CREATE INDEX IF NOT EXISTS idx_news_analysis_symbol_relevance ON news_analysis(symbol, relevance, trade_date DESC);
|
|
|
|
CREATE TABLE IF NOT EXISTS story_cache (
|
|
symbol TEXT NOT NULL,
|
|
as_of_date TEXT NOT NULL,
|
|
content TEXT NOT NULL,
|
|
source TEXT,
|
|
created_at TEXT NOT NULL,
|
|
updated_at TEXT NOT NULL,
|
|
PRIMARY KEY (symbol, as_of_date)
|
|
);
|
|
|
|
CREATE TABLE IF NOT EXISTS similar_day_cache (
|
|
symbol TEXT NOT NULL,
|
|
target_date TEXT NOT NULL,
|
|
payload_json TEXT NOT NULL,
|
|
source TEXT,
|
|
created_at TEXT NOT NULL,
|
|
updated_at TEXT NOT NULL,
|
|
PRIMARY KEY (symbol, target_date)
|
|
);
|
|
"""
|
|
|
|
|
|
def get_market_db_path() -> Path:
    """Resolve the long-lived market database path.

    The MARKET_DB_PATH environment variable (if non-blank) wins; otherwise
    fall back to <repo root>/data/market_research.db relative to this module.
    """
    override = os.getenv("MARKET_DB_PATH", "").strip()
    if override:
        return Path(override).expanduser()
    return Path(__file__).resolve().parents[2] / "data" / "market_research.db"
|
|
|
|
|
|
def _json_dumps(value: Any) -> str:
|
|
return json.dumps(value, ensure_ascii=False, sort_keys=True, default=str)
|
|
|
|
|
|
def _json_loads(value: str | None) -> Any:
|
|
if not value:
|
|
return None
|
|
try:
|
|
return json.loads(value)
|
|
except json.JSONDecodeError:
|
|
return value
|
|
|
|
|
|
def _hash_news_id(symbol: str, article: dict[str, Any], fallback_index: int) -> str:
|
|
base = article.get("id") or article.get("article_url") or article.get("title") or f"{symbol}-{fallback_index}"
|
|
digest = hashlib.sha1(str(base).encode("utf-8")).hexdigest()
|
|
return article.get("id") or f"polygon:{symbol}:{digest}"
|
|
|
|
|
|
def _utc_timestamp() -> str:
|
|
return datetime.now(timezone.utc).isoformat(timespec="seconds")
|
|
|
|
|
|
class MarketStore:
    """SQLite-backed market research warehouse. Use get_instance() for the singleton."""

    # Process-wide cached instance; may be replaced when a caller asks for a
    # different db_path (see __new__).
    _instance: Optional["MarketStore"] = None

    def __new__(cls, db_path: Path | None = None) -> "MarketStore":
        # Reuse the cached instance when no explicit path was requested, or
        # when the requested path matches the cached instance's path.
        if cls._instance is not None:
            if db_path is None or cls._instance.db_path == Path(db_path or get_market_db_path()):
                return cls._instance
        # Otherwise create and cache a fresh instance.  __init__ will fully
        # initialize it because its `_initialized` flag is not yet set.
        instance = super().__new__(cls)
        cls._instance = instance
        return instance
|
|
|
|
    def __init__(self, db_path: Path | None = None):
        # __init__ runs on every MarketStore(...) call, including when
        # __new__ handed back the cached singleton — skip re-initialization.
        if getattr(self, "_initialized", False):
            return
        self.db_path = Path(db_path or get_market_db_path())
        # Ensure the parent directory exists before SQLite creates the file.
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._init_db()
        self._initialized = True
|
|
|
|
@classmethod
|
|
def get_instance(cls, db_path: Path | None = None) -> "MarketStore":
|
|
"""Get the MarketStore singleton instance."""
|
|
return cls(db_path)
|
|
|
|
def _connect(self) -> sqlite3.Connection:
|
|
conn = sqlite3.connect(self.db_path)
|
|
conn.row_factory = sqlite3.Row
|
|
conn.execute("PRAGMA journal_mode=WAL")
|
|
conn.execute("PRAGMA foreign_keys=ON")
|
|
return conn
|
|
|
|
def _init_db(self):
|
|
with self._connect() as conn:
|
|
conn.executescript(SCHEMA)
|
|
|
|
def upsert_ticker(
|
|
self,
|
|
*,
|
|
symbol: str,
|
|
name: str | None = None,
|
|
sector: str | None = None,
|
|
is_active: bool = True,
|
|
) -> int:
|
|
timestamp = _utc_timestamp()
|
|
count = 0
|
|
with self._connect() as conn:
|
|
conn.execute(
|
|
"""
|
|
INSERT INTO tickers
|
|
(symbol, name, sector, is_active, created_at, updated_at)
|
|
VALUES (?, ?, ?, ?, ?, ?)
|
|
ON CONFLICT(symbol) DO UPDATE SET
|
|
name = COALESCE(excluded.name, tickers.name),
|
|
sector = COALESCE(excluded.sector, tickers.sector),
|
|
is_active = excluded.is_active,
|
|
updated_at = excluded.updated_at
|
|
""",
|
|
(symbol, name, sector, 1 if is_active else 0, timestamp, timestamp),
|
|
)
|
|
count += 1
|
|
return count
|
|
|
|
def update_fetch_watermark(
|
|
self,
|
|
*,
|
|
symbol: str,
|
|
price_date: str | None = None,
|
|
news_date: str | None = None,
|
|
) -> int:
|
|
timestamp = _utc_timestamp()
|
|
count = 0
|
|
with self._connect() as conn:
|
|
conn.execute(
|
|
"""
|
|
INSERT INTO tickers (symbol, created_at, updated_at, last_price_fetch, last_news_fetch)
|
|
VALUES (?, ?, ?, ?, ?)
|
|
ON CONFLICT(symbol) DO UPDATE SET
|
|
last_price_fetch = COALESCE(excluded.last_price_fetch, tickers.last_price_fetch),
|
|
last_news_fetch = COALESCE(excluded.last_news_fetch, tickers.last_news_fetch),
|
|
updated_at = excluded.updated_at
|
|
""",
|
|
(symbol, timestamp, timestamp, price_date, news_date),
|
|
)
|
|
count += 1
|
|
return count
|
|
|
|
def get_ticker_watermarks(self, symbol: str) -> dict[str, Any]:
|
|
with self._connect() as conn:
|
|
row = conn.execute(
|
|
"""
|
|
SELECT symbol, last_price_fetch, last_news_fetch
|
|
FROM tickers
|
|
WHERE symbol = ?
|
|
""",
|
|
(symbol,),
|
|
).fetchone()
|
|
return dict(row) if row else {
|
|
"symbol": symbol,
|
|
"last_price_fetch": None,
|
|
"last_news_fetch": None,
|
|
}
|
|
|
|
def get_latest_news_date(self, symbol: str) -> str | None:
|
|
"""Return the latest stored published news date for one ticker."""
|
|
with self._connect() as conn:
|
|
row = conn.execute(
|
|
"""
|
|
SELECT MAX(substr(nr.published_utc, 1, 10)) AS latest_date
|
|
FROM news_ticker nt
|
|
JOIN news_raw nr ON nr.id = nt.news_id
|
|
WHERE nt.symbol = ?
|
|
""",
|
|
(symbol,),
|
|
).fetchone()
|
|
return str(row["latest_date"]).strip() if row and row["latest_date"] else None
|
|
|
|
def upsert_ohlc(self, symbol: str, rows: Iterable[dict[str, Any]], *, source: str = "polygon") -> int:
|
|
timestamp = _utc_timestamp()
|
|
count = 0
|
|
with self._connect() as conn:
|
|
for row in rows:
|
|
if not row.get("date"):
|
|
continue
|
|
conn.execute(
|
|
"""
|
|
INSERT INTO ohlc
|
|
(symbol, date, open, high, low, close, volume, vwap, transactions, source, created_at, updated_at)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
ON CONFLICT(symbol, date) DO UPDATE SET
|
|
open = excluded.open,
|
|
high = excluded.high,
|
|
low = excluded.low,
|
|
close = excluded.close,
|
|
volume = excluded.volume,
|
|
vwap = excluded.vwap,
|
|
transactions = excluded.transactions,
|
|
source = excluded.source,
|
|
updated_at = excluded.updated_at
|
|
""",
|
|
(
|
|
symbol,
|
|
row.get("date"),
|
|
row.get("open"),
|
|
row.get("high"),
|
|
row.get("low"),
|
|
row.get("close"),
|
|
row.get("volume"),
|
|
row.get("vwap"),
|
|
row.get("transactions"),
|
|
source,
|
|
timestamp,
|
|
timestamp,
|
|
),
|
|
)
|
|
count += 1
|
|
return count
|
|
|
|
def upsert_news(self, symbol: str, articles: Iterable[dict[str, Any]], *, source: str = "polygon") -> int:
|
|
timestamp = _utc_timestamp()
|
|
count = 0
|
|
with self._connect() as conn:
|
|
for index, article in enumerate(articles):
|
|
news_id = _hash_news_id(symbol, article, index)
|
|
tickers = article.get("tickers") or [symbol]
|
|
conn.execute(
|
|
"""
|
|
INSERT INTO news_raw
|
|
(id, published_utc, title, summary, publisher, author, article_url, amp_url,
|
|
source, related_json, insights_json, raw_json, created_at, updated_at)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
ON CONFLICT(id) DO UPDATE SET
|
|
published_utc = excluded.published_utc,
|
|
title = excluded.title,
|
|
summary = excluded.summary,
|
|
publisher = excluded.publisher,
|
|
author = excluded.author,
|
|
article_url = excluded.article_url,
|
|
amp_url = excluded.amp_url,
|
|
source = excluded.source,
|
|
related_json = excluded.related_json,
|
|
insights_json = excluded.insights_json,
|
|
raw_json = excluded.raw_json,
|
|
updated_at = excluded.updated_at
|
|
""",
|
|
(
|
|
news_id,
|
|
article.get("published_utc"),
|
|
article.get("title"),
|
|
article.get("description") or article.get("summary"),
|
|
article.get("publisher"),
|
|
article.get("author"),
|
|
article.get("article_url"),
|
|
article.get("amp_url"),
|
|
source,
|
|
_json_dumps(tickers),
|
|
_json_dumps(article.get("insights")) if article.get("insights") else None,
|
|
_json_dumps(article),
|
|
timestamp,
|
|
timestamp,
|
|
),
|
|
)
|
|
count += 1
|
|
for ticker in tickers:
|
|
conn.execute(
|
|
"""
|
|
INSERT OR IGNORE INTO news_ticker (news_id, symbol, trade_date)
|
|
VALUES (?, ?, NULL)
|
|
""",
|
|
(news_id, str(ticker).strip().upper()),
|
|
)
|
|
return count
|
|
|
|
def get_news_without_trade_date(self, symbol: str | None = None, *, limit: int = 5000) -> list[dict[str, Any]]:
|
|
sql = """
|
|
SELECT nt.news_id, nt.symbol, nr.published_utc
|
|
FROM news_ticker nt
|
|
JOIN news_raw nr ON nr.id = nt.news_id
|
|
WHERE nt.trade_date IS NULL
|
|
"""
|
|
params: list[Any] = []
|
|
if symbol:
|
|
sql += " AND nt.symbol = ?"
|
|
params.append(symbol)
|
|
sql += " ORDER BY nr.published_utc ASC LIMIT ?"
|
|
params.append(max(1, int(limit)))
|
|
with self._connect() as conn:
|
|
rows = conn.execute(sql, params).fetchall()
|
|
return [dict(row) for row in rows]
|
|
|
|
def set_trade_dates(self, rows: Iterable[dict[str, str]]) -> int:
|
|
count = 0
|
|
with self._connect() as conn:
|
|
for row in rows:
|
|
conn.execute(
|
|
"""
|
|
UPDATE news_ticker
|
|
SET trade_date = ?
|
|
WHERE news_id = ? AND symbol = ?
|
|
""",
|
|
(row["trade_date"], row["news_id"], row["symbol"]),
|
|
)
|
|
count += 1
|
|
return count
|
|
|
|
def get_ohlc(self, symbol: str, start_date: str, end_date: str) -> list[dict[str, Any]]:
|
|
with self._connect() as conn:
|
|
rows = conn.execute(
|
|
"""
|
|
SELECT symbol, date, open, high, low, close, volume, vwap, transactions, source
|
|
FROM ohlc
|
|
WHERE symbol = ? AND date >= ? AND date <= ?
|
|
ORDER BY date ASC
|
|
""",
|
|
(symbol, start_date, end_date),
|
|
).fetchall()
|
|
return [dict(row) for row in rows]
|
|
|
|
def upsert_news_analysis(
|
|
self,
|
|
symbol: str,
|
|
rows: Iterable[dict[str, Any]],
|
|
*,
|
|
analysis_source: str = "local",
|
|
) -> int:
|
|
timestamp = _utc_timestamp()
|
|
normalized_symbol = str(symbol or "").strip().upper()
|
|
if not normalized_symbol:
|
|
return 0
|
|
|
|
count = 0
|
|
with self._connect() as conn:
|
|
for row in rows:
|
|
news_id = str(row.get("news_id") or row.get("id") or "").strip()
|
|
if not news_id:
|
|
continue
|
|
conn.execute(
|
|
"""
|
|
INSERT INTO news_analysis
|
|
(news_id, symbol, trade_date, relevance, sentiment, key_discussion, summary,
|
|
reason_growth, reason_decrease, ret_t0, ret_t1, ret_t3, ret_t5, ret_t10,
|
|
analysis_source, raw_json, created_at, updated_at)
|
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
ON CONFLICT(news_id, symbol) DO UPDATE SET
|
|
trade_date = excluded.trade_date,
|
|
relevance = excluded.relevance,
|
|
sentiment = excluded.sentiment,
|
|
key_discussion = excluded.key_discussion,
|
|
summary = excluded.summary,
|
|
reason_growth = excluded.reason_growth,
|
|
reason_decrease = excluded.reason_decrease,
|
|
ret_t0 = excluded.ret_t0,
|
|
ret_t1 = excluded.ret_t1,
|
|
ret_t3 = excluded.ret_t3,
|
|
ret_t5 = excluded.ret_t5,
|
|
ret_t10 = excluded.ret_t10,
|
|
analysis_source = excluded.analysis_source,
|
|
raw_json = excluded.raw_json,
|
|
updated_at = excluded.updated_at
|
|
""",
|
|
(
|
|
news_id,
|
|
normalized_symbol,
|
|
row.get("trade_date"),
|
|
row.get("relevance"),
|
|
row.get("sentiment"),
|
|
row.get("key_discussion"),
|
|
row.get("summary"),
|
|
row.get("reason_growth"),
|
|
row.get("reason_decrease"),
|
|
row.get("ret_t0"),
|
|
row.get("ret_t1"),
|
|
row.get("ret_t3"),
|
|
row.get("ret_t5"),
|
|
row.get("ret_t10"),
|
|
row.get("analysis_source") or analysis_source,
|
|
_json_dumps(row.get("raw_json") or row),
|
|
timestamp,
|
|
timestamp,
|
|
),
|
|
)
|
|
count += 1
|
|
return count
|
|
|
|
def get_analyzed_news_ids(
|
|
self,
|
|
symbol: str,
|
|
*,
|
|
start_date: str | None = None,
|
|
end_date: str | None = None,
|
|
) -> set[str]:
|
|
"""Return already analyzed news ids for a symbol and optional date window."""
|
|
sql = """
|
|
SELECT na.news_id
|
|
FROM news_analysis na
|
|
LEFT JOIN news_ticker nt ON nt.news_id = na.news_id AND nt.symbol = na.symbol
|
|
LEFT JOIN news_raw nr ON nr.id = na.news_id
|
|
WHERE na.symbol = ?
|
|
"""
|
|
params: list[Any] = [symbol]
|
|
if start_date:
|
|
sql += " AND COALESCE(na.trade_date, nt.trade_date, substr(nr.published_utc, 1, 10)) >= ?"
|
|
params.append(start_date)
|
|
if end_date:
|
|
sql += " AND COALESCE(na.trade_date, nt.trade_date, substr(nr.published_utc, 1, 10)) <= ?"
|
|
params.append(end_date)
|
|
with self._connect() as conn:
|
|
rows = conn.execute(sql, params).fetchall()
|
|
return {str(row["news_id"]) for row in rows if row["news_id"]}
|
|
|
|
def get_analyzed_news_sources(
|
|
self,
|
|
symbol: str,
|
|
*,
|
|
start_date: str | None = None,
|
|
end_date: str | None = None,
|
|
) -> dict[str, str]:
|
|
"""Return analyzed news ids mapped to analysis source."""
|
|
sql = """
|
|
SELECT na.news_id, na.analysis_source
|
|
FROM news_analysis na
|
|
LEFT JOIN news_ticker nt ON nt.news_id = na.news_id AND nt.symbol = na.symbol
|
|
LEFT JOIN news_raw nr ON nr.id = na.news_id
|
|
WHERE na.symbol = ?
|
|
"""
|
|
params: list[Any] = [symbol]
|
|
if start_date:
|
|
sql += " AND COALESCE(na.trade_date, nt.trade_date, substr(nr.published_utc, 1, 10)) >= ?"
|
|
params.append(start_date)
|
|
if end_date:
|
|
sql += " AND COALESCE(na.trade_date, nt.trade_date, substr(nr.published_utc, 1, 10)) <= ?"
|
|
params.append(end_date)
|
|
with self._connect() as conn:
|
|
rows = conn.execute(sql, params).fetchall()
|
|
return {
|
|
str(row["news_id"]): str(row["analysis_source"] or "").strip().lower()
|
|
for row in rows
|
|
if row["news_id"]
|
|
}
|
|
|
|
    @staticmethod
    def _normalize_enriched_news_row(row: sqlite3.Row | dict[str, Any]) -> dict[str, Any]:
        """Map one joined news+analysis row onto the public news-item dict shape.

        Accepts either a sqlite3.Row (normal query path) or a plain dict.
        NOTE(review): a sqlite3.Row that lacks the optional
        ``analysis_raw_json`` column would fall into the ``row.get`` branch,
        which sqlite3.Row does not implement — callers must SELECT that
        column (all current callers do); confirm before adding new ones.
        """
        # sqlite3.Row has no .get(), so access must be branched by type.
        related = row["related_json"] if isinstance(row, sqlite3.Row) else row.get("related_json")
        analysis_raw = row["analysis_raw_json"] if isinstance(row, sqlite3.Row) and "analysis_raw_json" in row.keys() else row.get("analysis_raw_json")
        # May decode to a dict (normal case) or pass through as a raw string.
        analysis_meta = _json_loads(analysis_raw)
        return {
            "id": row["id"],
            "ticker": row["symbol"],
            # Prefer the precise publish instant; fall back to the trade day.
            "date": row["published_utc"] or row["trade_date"],
            "trade_date": row["trade_date"],
            "source": row["publisher"] or row["raw_source"] or "polygon",
            "title": row["title"],
            # The analysis summary (if any) supersedes the raw article summary.
            "summary": row["analysis_summary"] or row["summary"],
            "url": row["article_url"],
            "related": _json_loads(related),
            # "category" only exists on some query shapes; default to "".
            "category": row["category"] if isinstance(row, sqlite3.Row) and "category" in row.keys() else "",
            "relevance": row["relevance"],
            "sentiment": row["sentiment"],
            "key_discussion": row["key_discussion"],
            "reason_growth": row["reason_growth"],
            "reason_decrease": row["reason_decrease"],
            "ret_t0": row["ret_t0"],
            "ret_t1": row["ret_t1"],
            "ret_t3": row["ret_t3"],
            "ret_t5": row["ret_t5"],
            "ret_t10": row["ret_t10"],
            "analysis_source": row["analysis_source"],
            "analysis_model_label": analysis_meta.get("model_label") if isinstance(analysis_meta, dict) else None,
        }
|
|
|
|
def get_news_items_enriched(
|
|
self,
|
|
symbol: str,
|
|
*,
|
|
start_date: str | None = None,
|
|
end_date: str | None = None,
|
|
trade_date: str | None = None,
|
|
limit: int = 100,
|
|
) -> list[dict[str, Any]]:
|
|
sql = """
|
|
SELECT nr.id,
|
|
nt.symbol,
|
|
nr.published_utc,
|
|
nt.trade_date,
|
|
nr.publisher,
|
|
nr.source AS raw_source,
|
|
nr.title,
|
|
nr.summary,
|
|
nr.article_url,
|
|
nr.related_json,
|
|
na.relevance,
|
|
na.sentiment,
|
|
na.key_discussion,
|
|
na.summary AS analysis_summary,
|
|
na.reason_growth,
|
|
na.reason_decrease,
|
|
na.ret_t0,
|
|
na.ret_t1,
|
|
na.ret_t3,
|
|
na.ret_t5,
|
|
na.ret_t10,
|
|
na.analysis_source,
|
|
na.raw_json AS analysis_raw_json
|
|
FROM news_ticker nt
|
|
JOIN news_raw nr ON nr.id = nt.news_id
|
|
LEFT JOIN news_analysis na ON na.news_id = nt.news_id AND na.symbol = nt.symbol
|
|
WHERE nt.symbol = ?
|
|
"""
|
|
params: list[Any] = [symbol]
|
|
if trade_date:
|
|
sql += " AND COALESCE(nt.trade_date, na.trade_date, substr(nr.published_utc, 1, 10)) = ?"
|
|
params.append(trade_date)
|
|
else:
|
|
if start_date:
|
|
sql += " AND COALESCE(nt.trade_date, na.trade_date, substr(nr.published_utc, 1, 10)) >= ?"
|
|
params.append(start_date)
|
|
if end_date:
|
|
sql += " AND COALESCE(nt.trade_date, na.trade_date, substr(nr.published_utc, 1, 10)) <= ?"
|
|
params.append(end_date)
|
|
sql += " ORDER BY COALESCE(nr.published_utc, nt.trade_date) DESC LIMIT ?"
|
|
params.append(max(1, int(limit)))
|
|
with self._connect() as conn:
|
|
rows = conn.execute(sql, params).fetchall()
|
|
return [self._normalize_enriched_news_row(row) for row in rows]
|
|
|
|
def get_news_items(
|
|
self,
|
|
symbol: str,
|
|
*,
|
|
start_date: str | None = None,
|
|
end_date: str | None = None,
|
|
limit: int = 100,
|
|
) -> list[dict[str, Any]]:
|
|
sql = """
|
|
SELECT nr.id,
|
|
nt.symbol,
|
|
nr.published_utc,
|
|
nt.trade_date,
|
|
nr.publisher,
|
|
nr.title,
|
|
nr.summary,
|
|
nr.article_url,
|
|
nr.related_json
|
|
FROM news_ticker nt
|
|
JOIN news_raw nr ON nr.id = nt.news_id
|
|
WHERE nt.symbol = ?
|
|
"""
|
|
params: list[Any] = [symbol]
|
|
if start_date:
|
|
sql += " AND COALESCE(nt.trade_date, substr(nr.published_utc, 1, 10)) >= ?"
|
|
params.append(start_date)
|
|
if end_date:
|
|
sql += " AND COALESCE(nt.trade_date, substr(nr.published_utc, 1, 10)) <= ?"
|
|
params.append(end_date)
|
|
sql += " ORDER BY COALESCE(nr.published_utc, nt.trade_date) DESC LIMIT ?"
|
|
params.append(max(1, int(limit)))
|
|
with self._connect() as conn:
|
|
rows = conn.execute(sql, params).fetchall()
|
|
return [
|
|
{
|
|
"id": row["id"],
|
|
"ticker": row["symbol"],
|
|
"date": row["published_utc"] or row["trade_date"],
|
|
"trade_date": row["trade_date"],
|
|
"source": row["publisher"] or "polygon",
|
|
"title": row["title"],
|
|
"summary": row["summary"],
|
|
"url": row["article_url"],
|
|
"related": _json_loads(row["related_json"]),
|
|
"category": "",
|
|
}
|
|
for row in rows
|
|
]
|
|
|
|
def get_news_timeline(
|
|
self,
|
|
symbol: str,
|
|
*,
|
|
start_date: str | None = None,
|
|
end_date: str | None = None,
|
|
) -> list[dict[str, Any]]:
|
|
sql = """
|
|
SELECT COALESCE(nt.trade_date, substr(nr.published_utc, 1, 10)) AS date,
|
|
COUNT(*) AS count,
|
|
COUNT(DISTINCT nr.publisher) AS source_count,
|
|
MAX(nr.title) AS top_title
|
|
FROM news_ticker nt
|
|
JOIN news_raw nr ON nr.id = nt.news_id
|
|
WHERE nt.symbol = ?
|
|
"""
|
|
params: list[Any] = [symbol]
|
|
if start_date:
|
|
sql += " AND COALESCE(nt.trade_date, substr(nr.published_utc, 1, 10)) >= ?"
|
|
params.append(start_date)
|
|
if end_date:
|
|
sql += " AND COALESCE(nt.trade_date, substr(nr.published_utc, 1, 10)) <= ?"
|
|
params.append(end_date)
|
|
sql += """
|
|
GROUP BY COALESCE(nt.trade_date, substr(nr.published_utc, 1, 10))
|
|
ORDER BY date ASC
|
|
"""
|
|
with self._connect() as conn:
|
|
rows = conn.execute(sql, params).fetchall()
|
|
return [
|
|
{
|
|
"date": row["date"],
|
|
"count": int(row["count"] or 0),
|
|
"source_count": int(row["source_count"] or 0),
|
|
"top_title": row["top_title"] or "",
|
|
}
|
|
for row in rows
|
|
if row["date"]
|
|
]
|
|
|
|
def get_news_timeline_enriched(
|
|
self,
|
|
symbol: str,
|
|
*,
|
|
start_date: str | None = None,
|
|
end_date: str | None = None,
|
|
) -> list[dict[str, Any]]:
|
|
sql = """
|
|
SELECT COALESCE(nt.trade_date, na.trade_date, substr(nr.published_utc, 1, 10)) AS date,
|
|
COUNT(*) AS count,
|
|
COUNT(DISTINCT nr.publisher) AS source_count,
|
|
MAX(nr.title) AS top_title,
|
|
SUM(CASE WHEN LOWER(COALESCE(na.sentiment, '')) = 'positive' THEN 1 ELSE 0 END) AS positive_count,
|
|
SUM(CASE WHEN LOWER(COALESCE(na.sentiment, '')) = 'negative' THEN 1 ELSE 0 END) AS negative_count,
|
|
SUM(CASE WHEN LOWER(COALESCE(na.sentiment, '')) IN ('neutral', '') OR na.sentiment IS NULL THEN 1 ELSE 0 END) AS neutral_count,
|
|
SUM(CASE WHEN LOWER(COALESCE(na.relevance, '')) IN ('high', 'relevant') THEN 1 ELSE 0 END) AS high_relevance_count
|
|
FROM news_ticker nt
|
|
JOIN news_raw nr ON nr.id = nt.news_id
|
|
LEFT JOIN news_analysis na ON na.news_id = nt.news_id AND na.symbol = nt.symbol
|
|
WHERE nt.symbol = ?
|
|
"""
|
|
params: list[Any] = [symbol]
|
|
if start_date:
|
|
sql += " AND COALESCE(nt.trade_date, na.trade_date, substr(nr.published_utc, 1, 10)) >= ?"
|
|
params.append(start_date)
|
|
if end_date:
|
|
sql += " AND COALESCE(nt.trade_date, na.trade_date, substr(nr.published_utc, 1, 10)) <= ?"
|
|
params.append(end_date)
|
|
sql += """
|
|
GROUP BY COALESCE(nt.trade_date, na.trade_date, substr(nr.published_utc, 1, 10))
|
|
ORDER BY date ASC
|
|
"""
|
|
with self._connect() as conn:
|
|
rows = conn.execute(sql, params).fetchall()
|
|
return [
|
|
{
|
|
"date": row["date"],
|
|
"count": int(row["count"] or 0),
|
|
"source_count": int(row["source_count"] or 0),
|
|
"top_title": row["top_title"] or "",
|
|
"positive_count": int(row["positive_count"] or 0),
|
|
"negative_count": int(row["negative_count"] or 0),
|
|
"neutral_count": int(row["neutral_count"] or 0),
|
|
"high_relevance_count": int(row["high_relevance_count"] or 0),
|
|
}
|
|
for row in rows
|
|
if row["date"]
|
|
]
|
|
|
|
def get_news_by_ids(self, symbol: str, article_ids: Iterable[str]) -> list[dict[str, Any]]:
|
|
ids = [str(item).strip() for item in article_ids if str(item).strip()]
|
|
if not ids:
|
|
return []
|
|
placeholders = ",".join("?" for _ in ids)
|
|
sql = f"""
|
|
SELECT nr.id,
|
|
nt.symbol,
|
|
nr.published_utc,
|
|
nt.trade_date,
|
|
nr.publisher,
|
|
nr.title,
|
|
nr.summary,
|
|
nr.article_url,
|
|
nr.related_json
|
|
FROM news_ticker nt
|
|
JOIN news_raw nr ON nr.id = nt.news_id
|
|
WHERE nt.symbol = ? AND nr.id IN ({placeholders})
|
|
ORDER BY COALESCE(nr.published_utc, nt.trade_date) DESC
|
|
"""
|
|
with self._connect() as conn:
|
|
rows = conn.execute(sql, [symbol, *ids]).fetchall()
|
|
return [
|
|
{
|
|
"id": row["id"],
|
|
"ticker": row["symbol"],
|
|
"date": row["published_utc"] or row["trade_date"],
|
|
"trade_date": row["trade_date"],
|
|
"source": row["publisher"] or "polygon",
|
|
"title": row["title"],
|
|
"summary": row["summary"],
|
|
"url": row["article_url"],
|
|
"related": _json_loads(row["related_json"]),
|
|
"category": "",
|
|
}
|
|
for row in rows
|
|
]
|
|
|
|
def get_news_by_ids_enriched(
|
|
self,
|
|
symbol: str,
|
|
article_ids: Iterable[str],
|
|
) -> list[dict[str, Any]]:
|
|
ids = [str(item).strip() for item in article_ids if str(item).strip()]
|
|
if not ids:
|
|
return []
|
|
placeholders = ",".join("?" for _ in ids)
|
|
sql = f"""
|
|
SELECT nr.id,
|
|
nt.symbol,
|
|
nr.published_utc,
|
|
nt.trade_date,
|
|
nr.publisher,
|
|
nr.source AS raw_source,
|
|
nr.title,
|
|
nr.summary,
|
|
nr.article_url,
|
|
nr.related_json,
|
|
na.relevance,
|
|
na.sentiment,
|
|
na.key_discussion,
|
|
na.summary AS analysis_summary,
|
|
na.reason_growth,
|
|
na.reason_decrease,
|
|
na.ret_t0,
|
|
na.ret_t1,
|
|
na.ret_t3,
|
|
na.ret_t5,
|
|
na.ret_t10,
|
|
na.analysis_source,
|
|
na.raw_json AS analysis_raw_json
|
|
FROM news_ticker nt
|
|
JOIN news_raw nr ON nr.id = nt.news_id
|
|
LEFT JOIN news_analysis na ON na.news_id = nt.news_id AND na.symbol = nt.symbol
|
|
WHERE nt.symbol = ? AND nr.id IN ({placeholders})
|
|
ORDER BY COALESCE(nr.published_utc, nt.trade_date) DESC
|
|
"""
|
|
with self._connect() as conn:
|
|
rows = conn.execute(sql, [symbol, *ids]).fetchall()
|
|
return [self._normalize_enriched_news_row(row) for row in rows]
|
|
|
|
def get_news_categories_enriched(
|
|
self,
|
|
symbol: str,
|
|
*,
|
|
start_date: str | None = None,
|
|
end_date: str | None = None,
|
|
limit: int = 200,
|
|
) -> dict[str, dict[str, Any]]:
|
|
rows = self.get_news_items_enriched(
|
|
symbol,
|
|
start_date=start_date,
|
|
end_date=end_date,
|
|
limit=limit,
|
|
)
|
|
categories: dict[str, dict[str, Any]] = {}
|
|
keyword_map = {
|
|
"market": [
|
|
"market", "stock", "rally", "sell-off", "selloff", "trading",
|
|
"wall street", "s&p", "nasdaq", "dow", "index", "bull", "bear",
|
|
"correction", "volatility",
|
|
],
|
|
"policy": [
|
|
"regulation", "fed", "federal reserve", "tariff", "sanction",
|
|
"interest rate", "policy", "government", "congress", "sec",
|
|
"trade war", "ban", "legislation", "tax",
|
|
],
|
|
"earnings": [
|
|
"earnings", "revenue", "profit", "quarter", "eps", "guidance",
|
|
"forecast", "income", "sales", "beat", "miss", "outlook",
|
|
"financial results",
|
|
],
|
|
"product_tech": [
|
|
"product", "ai", "chip", "cloud", "launch", "patent",
|
|
"technology", "innovation", "release", "platform", "model",
|
|
"software", "hardware", "gpu", "autonomous",
|
|
],
|
|
"competition": [
|
|
"competitor", "rival", "market share", "overtake", "compete",
|
|
"competition", "vs", "versus", "battle", "challenge",
|
|
],
|
|
"management": [
|
|
"ceo", "executive", "resign", "layoff", "restructure",
|
|
"management", "leadership", "appoint", "hire", "board",
|
|
"chairman",
|
|
],
|
|
}
|
|
for key in keyword_map:
|
|
categories[key] = {
|
|
"label": key,
|
|
"count": 0,
|
|
"article_ids": [],
|
|
"positive_ids": [],
|
|
"negative_ids": [],
|
|
"neutral_ids": [],
|
|
}
|
|
for row in rows:
|
|
text = " ".join(
|
|
str(row.get(field) or "")
|
|
for field in (
|
|
"title",
|
|
"summary",
|
|
"key_discussion",
|
|
"reason_growth",
|
|
"reason_decrease",
|
|
)
|
|
).lower()
|
|
sentiment = str(row.get("sentiment") or "").strip().lower()
|
|
for category, keywords in keyword_map.items():
|
|
if not any(keyword in text for keyword in keywords):
|
|
continue
|
|
bucket = categories[category]
|
|
bucket["count"] += 1
|
|
bucket["article_ids"].append(row["id"])
|
|
if sentiment == "positive":
|
|
bucket["positive_ids"].append(row["id"])
|
|
elif sentiment == "negative":
|
|
bucket["negative_ids"].append(row["id"])
|
|
else:
|
|
bucket["neutral_ids"].append(row["id"])
|
|
return categories
|
|
|
|
def get_story_cache(
|
|
self,
|
|
symbol: str,
|
|
*,
|
|
as_of_date: str,
|
|
) -> dict[str, Any] | None:
|
|
with self._connect() as conn:
|
|
row = conn.execute(
|
|
"""
|
|
SELECT symbol, as_of_date, content, source, created_at, updated_at
|
|
FROM story_cache
|
|
WHERE symbol = ? AND as_of_date = ?
|
|
""",
|
|
(symbol, as_of_date),
|
|
).fetchone()
|
|
return dict(row) if row else None
|
|
|
|
def upsert_story_cache(
|
|
self,
|
|
symbol: str,
|
|
*,
|
|
as_of_date: str,
|
|
content: str,
|
|
source: str = "local",
|
|
) -> int:
|
|
timestamp = _utc_timestamp()
|
|
count = 0
|
|
with self._connect() as conn:
|
|
conn.execute(
|
|
"""
|
|
INSERT INTO story_cache
|
|
(symbol, as_of_date, content, source, created_at, updated_at)
|
|
VALUES (?, ?, ?, ?, ?, ?)
|
|
ON CONFLICT(symbol, as_of_date) DO UPDATE SET
|
|
content = excluded.content,
|
|
source = excluded.source,
|
|
updated_at = excluded.updated_at
|
|
""",
|
|
(symbol, as_of_date, content, source, timestamp, timestamp),
|
|
)
|
|
count += 1
|
|
return count
|
|
|
|
def delete_story_cache(
|
|
self,
|
|
symbol: str,
|
|
*,
|
|
as_of_date: str | None = None,
|
|
) -> int:
|
|
with self._connect() as conn:
|
|
if as_of_date:
|
|
result = conn.execute(
|
|
"""
|
|
DELETE FROM story_cache
|
|
WHERE symbol = ? AND as_of_date = ?
|
|
""",
|
|
(symbol, as_of_date),
|
|
)
|
|
else:
|
|
result = conn.execute(
|
|
"""
|
|
DELETE FROM story_cache
|
|
WHERE symbol = ?
|
|
""",
|
|
(symbol,),
|
|
)
|
|
return int(result.rowcount or 0)
|
|
|
|
def get_similar_day_cache(
|
|
self,
|
|
symbol: str,
|
|
*,
|
|
target_date: str,
|
|
) -> dict[str, Any] | None:
|
|
with self._connect() as conn:
|
|
row = conn.execute(
|
|
"""
|
|
SELECT symbol, target_date, payload_json, source, created_at, updated_at
|
|
FROM similar_day_cache
|
|
WHERE symbol = ? AND target_date = ?
|
|
""",
|
|
(symbol, target_date),
|
|
).fetchone()
|
|
if not row:
|
|
return None
|
|
return {
|
|
"symbol": row["symbol"],
|
|
"target_date": row["target_date"],
|
|
"payload": _json_loads(row["payload_json"]),
|
|
"source": row["source"],
|
|
"created_at": row["created_at"],
|
|
"updated_at": row["updated_at"],
|
|
}
|
|
|
|
def upsert_similar_day_cache(
|
|
self,
|
|
symbol: str,
|
|
*,
|
|
target_date: str,
|
|
payload: dict[str, Any],
|
|
source: str = "local",
|
|
) -> int:
|
|
timestamp = _utc_timestamp()
|
|
count = 0
|
|
with self._connect() as conn:
|
|
conn.execute(
|
|
"""
|
|
INSERT INTO similar_day_cache
|
|
(symbol, target_date, payload_json, source, created_at, updated_at)
|
|
VALUES (?, ?, ?, ?, ?, ?)
|
|
ON CONFLICT(symbol, target_date) DO UPDATE SET
|
|
payload_json = excluded.payload_json,
|
|
source = excluded.source,
|
|
updated_at = excluded.updated_at
|
|
""",
|
|
(symbol, target_date, _json_dumps(payload), source, timestamp, timestamp),
|
|
)
|
|
count += 1
|
|
return count
|
|
|
|
def delete_similar_day_cache(
|
|
self,
|
|
symbol: str,
|
|
*,
|
|
target_date: str | None = None,
|
|
) -> int:
|
|
with self._connect() as conn:
|
|
if target_date:
|
|
result = conn.execute(
|
|
"""
|
|
DELETE FROM similar_day_cache
|
|
WHERE symbol = ? AND target_date = ?
|
|
""",
|
|
(symbol, target_date),
|
|
)
|
|
else:
|
|
result = conn.execute(
|
|
"""
|
|
DELETE FROM similar_day_cache
|
|
WHERE symbol = ?
|
|
""",
|
|
(symbol,),
|
|
)
|
|
return int(result.rowcount or 0)
|
|
|
|
def get_enrich_report(
|
|
self,
|
|
symbols: list[str] | None = None,
|
|
*,
|
|
start_date: str | None = None,
|
|
end_date: str | None = None,
|
|
) -> list[dict[str, Any]]:
|
|
"""Summarize explain enrichment coverage and freshness per ticker."""
|
|
sql = """
|
|
SELECT nt.symbol AS symbol,
|
|
COUNT(DISTINCT nt.news_id) AS raw_news_count,
|
|
COUNT(DISTINCT na.news_id) AS analyzed_news_count,
|
|
SUM(CASE WHEN LOWER(COALESCE(na.analysis_source, '')) = 'llm' THEN 1 ELSE 0 END) AS llm_count,
|
|
SUM(CASE WHEN LOWER(COALESCE(na.analysis_source, '')) = 'local' THEN 1 ELSE 0 END) AS local_count,
|
|
MAX(na.updated_at) AS latest_analysis_at,
|
|
MAX(nt.trade_date) AS latest_trade_date
|
|
FROM news_ticker nt
|
|
LEFT JOIN news_analysis na ON na.news_id = nt.news_id AND na.symbol = nt.symbol
|
|
LEFT JOIN news_raw nr ON nr.id = nt.news_id
|
|
WHERE 1 = 1
|
|
"""
|
|
params: list[Any] = []
|
|
if symbols:
|
|
normalized = [str(symbol).strip().upper() for symbol in symbols if str(symbol).strip()]
|
|
if normalized:
|
|
placeholders = ",".join("?" for _ in normalized)
|
|
sql += f" AND nt.symbol IN ({placeholders})"
|
|
params.extend(normalized)
|
|
if start_date:
|
|
sql += " AND COALESCE(nt.trade_date, na.trade_date, substr(nr.published_utc, 1, 10)) >= ?"
|
|
params.append(start_date)
|
|
if end_date:
|
|
sql += " AND COALESCE(nt.trade_date, na.trade_date, substr(nr.published_utc, 1, 10)) <= ?"
|
|
params.append(end_date)
|
|
sql += " GROUP BY nt.symbol ORDER BY nt.symbol ASC"
|
|
|
|
with self._connect() as conn:
|
|
rows = conn.execute(sql, params).fetchall()
|
|
|
|
report: list[dict[str, Any]] = []
|
|
for row in rows:
|
|
raw_news_count = int(row["raw_news_count"] or 0)
|
|
analyzed_news_count = int(row["analyzed_news_count"] or 0)
|
|
coverage_pct = (
|
|
round((analyzed_news_count / raw_news_count) * 100, 1)
|
|
if raw_news_count > 0
|
|
else 0.0
|
|
)
|
|
report.append(
|
|
{
|
|
"symbol": row["symbol"],
|
|
"raw_news_count": raw_news_count,
|
|
"analyzed_news_count": analyzed_news_count,
|
|
"coverage_pct": coverage_pct,
|
|
"llm_count": int(row["llm_count"] or 0),
|
|
"local_count": int(row["local_count"] or 0),
|
|
"latest_analysis_at": row["latest_analysis_at"],
|
|
"latest_trade_date": row["latest_trade_date"],
|
|
}
|
|
)
|
|
return report
|