# -*- coding: utf-8 -*- """Same-ticker historical similar day search for explain view.""" from __future__ import annotations from math import sqrt from typing import Any from backend.data.market_store import MarketStore def _safe_float(value: Any, default: float = 0.0) -> float: try: parsed = float(value) except (TypeError, ValueError): return default return parsed def build_daily_feature_rows( *, symbol: str, ohlc_rows: list[dict[str, Any]], news_rows: list[dict[str, Any]], ) -> list[dict[str, Any]]: """Aggregate price/news context into daily feature rows.""" price_by_date = {str(row.get("date")): row for row in ohlc_rows if row.get("date")} ordered_dates = [str(row.get("date")) for row in ohlc_rows if row.get("date")] news_by_date: dict[str, list[dict[str, Any]]] = {} for row in news_rows: trade_date = str(row.get("trade_date") or "")[:10] or str(row.get("date") or "")[:10] if not trade_date: continue news_by_date.setdefault(trade_date, []).append(row) features: list[dict[str, Any]] = [] previous_close: float | None = None for idx, date in enumerate(ordered_dates): price_row = price_by_date[date] close_price = _safe_float(price_row.get("close")) open_price = _safe_float(price_row.get("open"), close_price) day_news = news_by_date.get(date, []) positive_count = sum(1 for item in day_news if str(item.get("sentiment") or "").lower() == "positive") negative_count = sum(1 for item in day_news if str(item.get("sentiment") or "").lower() == "negative") high_relevance_count = sum( 1 for item in day_news if str(item.get("relevance") or "").lower() in {"high", "relevant"} ) ret_1d = ( ((close_price - previous_close) / previous_close) if previous_close not in (None, 0) else 0.0 ) intraday_ret = ((close_price - open_price) / open_price) if open_price else 0.0 sentiment_score = ( (positive_count - negative_count) / max(len(day_news), 1) if day_news else 0.0 ) future_t1 = None future_t3 = None if idx + 1 < len(ordered_dates) and close_price: next_close = _safe_float(price_by_date[ordered_dates[idx + 1]].get("close")) future_t1 = ((next_close - close_price) / close_price) if next_close else None if idx + 3 < len(ordered_dates) and close_price: next_close = _safe_float(price_by_date[ordered_dates[idx + 3]].get("close")) future_t3 = ((next_close - close_price) / close_price) if next_close else None features.append( { "date": date, "symbol": symbol, "n_articles": len(day_news), "positive_count": positive_count, "negative_count": negative_count, "high_relevance_count": high_relevance_count, "sentiment_score": sentiment_score, "ret_1d": ret_1d, "intraday_ret": intraday_ret, "close": close_price, "ret_t1_after": future_t1, "ret_t3_after": future_t3, "news": [ { "title": row.get("title") or "", "sentiment": row.get("sentiment") or "neutral", } for row in day_news[:3] ], } ) previous_close = close_price return features def compute_similarity_scores( target_vector: list[float], candidate_vectors: list[tuple[str, list[float], dict[str, Any]]], ) -> list[dict[str, Any]]: """Return sorted similarity matches based on normalized Euclidean distance.""" if not candidate_vectors: return [] dimensions = len(target_vector) ranges = [] for dimension in range(dimensions): values = [vector[1][dimension] for vector in candidate_vectors] + [target_vector[dimension]] min_value = min(values) max_value = max(values) ranges.append(max(max_value - min_value, 1e-9)) scored = [] for date, vector, payload in candidate_vectors: distance = sqrt( sum( ((target_vector[i] - vector[i]) / ranges[i]) ** 2 for i in range(dimensions) ) ) similarity = 1.0 / (1.0 + distance) scored.append( { "date": date, "score": round(similarity, 4), **payload, } ) return sorted(scored, key=lambda item: item["score"], reverse=True) def find_similar_days( store: MarketStore, *, symbol: str, target_date: str, top_k: int = 10, ) -> dict[str, Any]: """Find same-ticker historical days most similar to a target day.""" cached = store.get_similar_day_cache(symbol, target_date=target_date) if cached and cached.get("payload"): return cached["payload"] ohlc_rows = store.get_ohlc(symbol, "1900-01-01", target_date) news_rows = store.get_news_items_enriched(symbol, end_date=target_date, limit=500) daily_rows = build_daily_feature_rows(symbol=symbol, ohlc_rows=ohlc_rows, news_rows=news_rows) feature_map = {row["date"]: row for row in daily_rows} target_row = feature_map.get(target_date) if not target_row: return { "symbol": symbol, "target_date": target_date, "items": [], "error": "No feature row for target date", } vector_keys = [ "sentiment_score", "n_articles", "positive_count", "negative_count", "high_relevance_count", "ret_1d", "intraday_ret", ] target_vector = [_safe_float(target_row.get(key)) for key in vector_keys] candidates = [] for row in daily_rows: date = row["date"] if date == target_date: continue payload = { "n_articles": row["n_articles"], "sentiment_score": round(row["sentiment_score"], 4), "ret_1d": round(row["ret_1d"] * 100, 2), "intraday_ret": round(row["intraday_ret"] * 100, 2), "ret_t1_after": round(row["ret_t1_after"] * 100, 2) if row["ret_t1_after"] is not None else None, "ret_t3_after": round(row["ret_t3_after"] * 100, 2) if row["ret_t3_after"] is not None else None, "top_reasons": [item["title"] for item in row["news"][:2] if item.get("title")], "news": row["news"], } candidates.append( ( date, [_safe_float(row.get(key)) for key in vector_keys], payload, ) ) items = compute_similarity_scores(target_vector, candidates)[: max(1, min(int(top_k), 20))] result = { "symbol": symbol, "target_date": target_date, "target_features": { "sentiment_score": round(target_row["sentiment_score"], 4), "n_articles": target_row["n_articles"], "ret_1d": round(target_row["ret_1d"] * 100, 2), "intraday_ret": round(target_row["intraday_ret"] * 100, 2), "high_relevance_count": target_row["high_relevance_count"], }, "items": items, } store.upsert_similar_day_cache(symbol, target_date=target_date, payload=result, source="local") return result