Initial commit of integrated agent system
This commit is contained in:
202
backend/explain/similarity_service.py
Normal file
202
backend/explain/similarity_service.py
Normal file
@@ -0,0 +1,202 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Same-ticker historical similar day search for explain view."""
|
||||
|
||||
from __future__ import annotations

from math import isfinite, sqrt
from typing import Any

from backend.data.market_store import MarketStore
|
||||
|
||||
|
||||
def _safe_float(value: Any, default: float = 0.0) -> float:
|
||||
try:
|
||||
parsed = float(value)
|
||||
except (TypeError, ValueError):
|
||||
return default
|
||||
return parsed
|
||||
|
||||
|
||||
def _group_news_by_trade_date(
    news_rows: list[dict[str, Any]],
) -> dict[str, list[dict[str, Any]]]:
    """Bucket news rows under the first 10 characters of their date value."""
    grouped: dict[str, list[dict[str, Any]]] = {}
    for row in news_rows:
        # Prefer ``trade_date``; fall back to ``date`` when it is missing/empty.
        trade_date = str(row.get("trade_date") or "")[:10] or str(row.get("date") or "")[:10]
        if trade_date:
            grouped.setdefault(trade_date, []).append(row)
    return grouped


def _relative_change(current: float, base: float | None) -> float | None:
    """Return ``(current - base) / base``, or ``None`` if either price is missing or zero."""
    if not current or not base:
        return None
    return (current - base) / base


def _forward_return(closes: list[float], idx: int, horizon: int) -> float | None:
    """Return the close-to-close change from ``idx`` to ``idx + horizon``, if both exist."""
    ahead = idx + horizon
    if ahead >= len(closes):
        return None
    return _relative_change(closes[ahead], closes[idx])


def build_daily_feature_rows(
    *,
    symbol: str,
    ohlc_rows: list[dict[str, Any]],
    news_rows: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Aggregate price/news context into daily feature rows.

    One feature row is produced per distinct ``date`` found in *ohlc_rows*,
    in the order the dates first appear. The rows are not sorted here, so the
    previous-day and look-ahead returns are only meaningful if *ohlc_rows* is
    already in chronological order (assumed -- confirm against the caller).

    Args:
        symbol: Ticker copied verbatim into every feature row.
        ohlc_rows: Price rows; the ``date``, ``open`` and ``close`` keys are
            read. Rows without a truthy ``date`` are ignored. If a date
            occurs more than once, the last row for that date is used.
        news_rows: News rows; ``trade_date`` (or ``date`` as fallback),
            ``sentiment``, ``relevance`` and ``title`` are read.

    Returns:
        A list of dicts with the keys ``date``, ``symbol``, ``n_articles``,
        ``positive_count``, ``negative_count``, ``high_relevance_count``,
        ``sentiment_score``, ``ret_1d``, ``intraday_ret``, ``close``,
        ``ret_t1_after``, ``ret_t3_after`` and ``news`` (at most three
        ``{"title", "sentiment"}`` entries). Returns are fractions, not
        percentages. ``ret_1d`` and ``intraday_ret`` fall back to ``0.0`` and
        the look-ahead returns to ``None`` when a required price is missing
        or zero.
    """
    price_by_date = {str(row.get("date")): row for row in ohlc_rows if row.get("date")}
    # Take the ordering from the de-duplicated mapping so a repeated date
    # cannot yield two feature rows or shift the look-ahead indices.
    ordered_dates = list(price_by_date)
    closes = [_safe_float(price_by_date[date].get("close")) for date in ordered_dates]
    news_by_date = _group_news_by_trade_date(news_rows)

    features: list[dict[str, Any]] = []
    previous_close: float | None = None
    for idx, (date, close_price) in enumerate(zip(ordered_dates, closes)):
        # A missing open falls back to the close, which makes intraday_ret 0.
        open_price = _safe_float(price_by_date[date].get("open"), close_price)
        day_news = news_by_date.get(date, [])

        sentiments = [str(item.get("sentiment") or "").lower() for item in day_news]
        positive_count = sentiments.count("positive")
        negative_count = sentiments.count("negative")
        high_relevance_count = sum(
            1
            for item in day_news
            if str(item.get("relevance") or "").lower() in {"high", "relevant"}
        )
        sentiment_score = (
            (positive_count - negative_count) / len(day_news) if day_news else 0.0
        )

        # A missing (zero) close must not be reported as a -100% move, so both
        # same-day returns use the same guard as the look-ahead returns.
        ret_1d = _relative_change(close_price, previous_close)
        intraday_ret = _relative_change(close_price, open_price)

        features.append(
            {
                "date": date,
                "symbol": symbol,
                "n_articles": len(day_news),
                "positive_count": positive_count,
                "negative_count": negative_count,
                "high_relevance_count": high_relevance_count,
                "sentiment_score": sentiment_score,
                "ret_1d": ret_1d if ret_1d is not None else 0.0,
                "intraday_ret": intraday_ret if intraday_ret is not None else 0.0,
                "close": close_price,
                "ret_t1_after": _forward_return(closes, idx, 1),
                "ret_t3_after": _forward_return(closes, idx, 3),
                "news": [
                    {
                        "title": row.get("title") or "",
                        "sentiment": row.get("sentiment") or "neutral",
                    }
                    for row in day_news[:3]
                ],
            }
        )
        previous_close = close_price
    return features
|
||||
|
||||
|
||||
def compute_similarity_scores(
    target_vector: list[float],
    candidate_vectors: list[tuple[str, list[float], dict[str, Any]]],
) -> list[dict[str, Any]]:
    """Return similarity matches sorted by normalized Euclidean distance.

    Each dimension is scaled by the range (max - min) observed across the
    target and all candidates, so that no single feature dominates the
    distance. The distance is mapped to a score with ``1 / (1 + distance)``,
    giving ``1.0`` for an identical vector and values approaching ``0`` as
    the distance grows.

    Args:
        target_vector: Feature values of the reference item.
        candidate_vectors: ``(date, vector, payload)`` tuples. Each vector
            must have at least ``len(target_vector)`` entries.

    Returns:
        One dict per candidate, closest first, holding ``date``, ``score``
        (rounded to 4 decimals) and the keys of the candidate's payload.
        Candidates at exactly the same distance keep their input order.
        An empty candidate list yields an empty list.
    """
    if not candidate_vectors:
        return []

    dimensions = len(target_vector)
    ranges = []
    for dimension in range(dimensions):
        values = [vector[dimension] for _, vector, _ in candidate_vectors]
        values.append(target_vector[dimension])
        # Floor the range so a constant dimension does not divide by zero.
        ranges.append(max(max(values) - min(values), 1e-9))

    ranked = []
    for date, vector, payload in candidate_vectors:
        distance = sqrt(
            sum(
                ((target - value) / span) ** 2
                for target, value, span in zip(target_vector, vector, ranges)
            )
        )
        ranked.append((distance, date, payload))

    # Rank on the exact distance: sorting on the rounded score would tie
    # distinct candidates and fall back to input order between them.
    ranked.sort(key=lambda entry: entry[0])
    return [
        {"date": date, "score": round(1.0 / (1.0 + distance), 4), **payload}
        for distance, date, payload in ranked
    ]
|
||||
|
||||
|
||||
def _as_percent(value: float | None) -> float | None:
    """Convert a fractional return to a percentage rounded to 2 decimals; keep ``None``."""
    return None if value is None else round(value * 100, 2)


def find_similar_days(
    store: MarketStore,
    *,
    symbol: str,
    target_date: str,
    top_k: int = 10,
) -> dict[str, Any]:
    """Find same-ticker historical days most similar to a target day.

    Price rows up to *target_date* and up to 500 enriched news items are
    loaded from *store*, turned into daily feature rows and ranked against
    the target day's features.

    The cache is keyed on symbol and target date only, so the cached payload
    always holds the maximum number of matches (20) and is sliced to the
    requested *top_k* on every call. Payloads cached with fewer items than
    requested are returned as they are.

    Args:
        store: Market data store used for prices, news and the result cache.
        symbol: Ticker to search within.
        target_date: Date whose feature row is the reference; it must match
            the ``date`` value of one of the loaded price rows.
        top_k: Number of matches to return, clamped to the range 1..20.

    Returns:
        A dict with ``symbol``, ``target_date``, ``target_features`` and
        ``items`` (best match first). When no feature row exists for
        *target_date*, ``items`` is empty, an ``error`` message is included,
        and nothing is cached.
    """
    max_items = 20
    limit = max(1, min(int(top_k), max_items))

    cached = store.get_similar_day_cache(symbol, target_date=target_date)
    if cached and cached.get("payload"):
        payload = cached["payload"]
        items = payload.get("items") if isinstance(payload, dict) else None
        if isinstance(items, list):
            # Honour this call's top_k rather than the one that filled the cache.
            return {**payload, "items": items[:limit]}
        return payload

    ohlc_rows = store.get_ohlc(symbol, "1900-01-01", target_date)
    news_rows = store.get_news_items_enriched(symbol, end_date=target_date, limit=500)
    daily_rows = build_daily_feature_rows(symbol=symbol, ohlc_rows=ohlc_rows, news_rows=news_rows)
    feature_map = {row["date"]: row for row in daily_rows}
    target_row = feature_map.get(target_date)
    if not target_row:
        return {
            "symbol": symbol,
            "target_date": target_date,
            "items": [],
            "error": "No feature row for target date",
        }

    vector_keys = [
        "sentiment_score",
        "n_articles",
        "positive_count",
        "negative_count",
        "high_relevance_count",
        "ret_1d",
        "intraday_ret",
    ]
    target_vector = [_safe_float(target_row.get(key)) for key in vector_keys]

    candidates = []
    for row in daily_rows:
        date = row["date"]
        if date == target_date:
            continue  # never match the target day against itself
        payload = {
            "n_articles": row["n_articles"],
            "sentiment_score": round(row["sentiment_score"], 4),
            "ret_1d": _as_percent(row["ret_1d"]),
            "intraday_ret": _as_percent(row["intraday_ret"]),
            "ret_t1_after": _as_percent(row["ret_t1_after"]),
            "ret_t3_after": _as_percent(row["ret_t3_after"]),
            "top_reasons": [item["title"] for item in row["news"][:2] if item.get("title")],
            "news": row["news"],
        }
        candidates.append((date, [_safe_float(row.get(key)) for key in vector_keys], payload))

    ranked = compute_similarity_scores(target_vector, candidates)[:max_items]
    result = {
        "symbol": symbol,
        "target_date": target_date,
        "target_features": {
            "sentiment_score": round(target_row["sentiment_score"], 4),
            "n_articles": target_row["n_articles"],
            "ret_1d": _as_percent(target_row["ret_1d"]),
            "intraday_ret": _as_percent(target_row["intraday_ret"]),
            "high_relevance_count": target_row["high_relevance_count"],
        },
        "items": ranked,
    }
    # Cache the full ranking so later calls with a larger top_k are served correctly.
    store.upsert_similar_day_cache(symbol, target_date=target_date, payload=result, source="local")
    return {**result, "items": ranked[:limit]}
|
||||
Reference in New Issue
Block a user