Initial commit of integrated agent system

2026-03-30 17:46:44 +08:00
commit 0fa413380c
337 changed files with 75268 additions and 0 deletions
--- a/backend/explain/__init__.py
+++ b/backend/explain/__init__.py
@@ -0,0 +1,2 @@
+# -*- coding: utf-8 -*-
+"""Explain-oriented services for stock narratives and news research."""
--- a/backend/explain/category_engine.py
+++ b/backend/explain/category_engine.py
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+"""Rule-based news categorization for explain UI."""
+
+from __future__ import annotations
+
+from typing import Any, Dict, Iterable
+
+
+CATEGORY_KEYWORDS = {
+    "market": [
+        "market", "stock", "rally", "sell-off", "selloff", "trading",
+        "wall street", "s&p", "nasdaq", "dow", "index", "bull", "bear",
+        "correction", "volatility",
+    ],
+    "policy": [
+        "regulation", "fed", "federal reserve", "tariff", "sanction",
+        "interest rate", "policy", "government", "congress", "sec",
+        "trade war", "ban", "legislation", "tax",
+    ],
+    "earnings": [
+        "earnings", "revenue", "profit", "quarter", "eps", "guidance",
+        "forecast", "income", "sales", "beat", "miss", "outlook",
+        "financial results",
+    ],
+    "product_tech": [
+        "product", "ai", "chip", "cloud", "launch", "patent",
+        "technology", "innovation", "release", "platform", "model",
+        "software", "hardware", "gpu", "autonomous",
+    ],
+    "competition": [
+        "competitor", "rival", "market share", "overtake", "compete",
+        "competition", "vs", "versus", "battle", "challenge",
+    ],
+    "management": [
+        "ceo", "executive", "resign", "layoff", "restructure",
+        "management", "leadership", "appoint", "hire", "board",
+        "chairman",
+    ],
+}
+
+
+def categorize_news_rows(rows: Iterable[Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
+    """Bucket news rows by keyword categories."""
+    categories: Dict[str, Dict[str, Any]] = {
+        key: {
+            "label": key,
+            "count": 0,
+            "article_ids": [],
+        }
+        for key in CATEGORY_KEYWORDS
+    }
+
+    for row in rows:
+        text = " ".join(
+            [
+                str(row.get("title") or ""),
+                str(row.get("summary") or ""),
+                str(row.get("related") or ""),
+                str(row.get("category") or ""),
+            ]
+        ).lower()
+        article_id = row.get("id")
+        for category, keywords in CATEGORY_KEYWORDS.items():
+            if any(keyword in text for keyword in keywords):
+                categories[category]["count"] += 1
+                if article_id:
+                    categories[category]["article_ids"].append(article_id)
+
+    return categories
--- a/backend/explain/range_explainer.py
+++ b/backend/explain/range_explainer.py
@@ -0,0 +1,214 @@
+# -*- coding: utf-8 -*-
+"""Local range explanation built from price and persisted news."""
+
+from __future__ import annotations
+
+from typing import Any, Dict
+
+from backend.enrich.llm_enricher import analyze_range_with_llm
+from backend.explain.category_engine import categorize_news_rows
+from backend.tools.data_tools import get_prices
+
+
+def _rank_event_score(row: Dict[str, Any]) -> float:
+    relevance = str(row.get("relevance") or "").strip().lower()
+    relevance_score = {"high": 3.0, "relevant": 3.0, "medium": 2.0, "low": 1.0}.get(
+        relevance,
+        0.5,
+    )
+    impact_score = abs(float(row.get("ret_t0") or 0.0)) * 100
+    return relevance_score + impact_score
+
+
+def summarize_bullish_factors(
+    news_rows: list[Dict[str, Any]],
+    *,
+    limit: int = 5,
+) -> list[str]:
+    factors = []
+    for row in news_rows:
+        if str(row.get("sentiment") or "").strip().lower() != "positive":
+            continue
+        candidate = row.get("reason_growth") or row.get("key_discussion") or row.get("summary") or row.get("title")
+        if candidate:
+            factors.append(str(candidate).strip())
+    seen = set()
+    output = []
+    for factor in factors:
+        if factor in seen:
+            continue
+        seen.add(factor)
+        output.append(factor[:200])
+        if len(output) >= limit:
+            break
+    return output
+
+
+def summarize_bearish_factors(
+    news_rows: list[Dict[str, Any]],
+    *,
+    limit: int = 5,
+) -> list[str]:
+    factors = []
+    for row in news_rows:
+        if str(row.get("sentiment") or "").strip().lower() != "negative":
+            continue
+        candidate = row.get("reason_decrease") or row.get("key_discussion") or row.get("summary") or row.get("title")
+        if candidate:
+            factors.append(str(candidate).strip())
+    seen = set()
+    output = []
+    for factor in factors:
+        if factor in seen:
+            continue
+        seen.add(factor)
+        output.append(factor[:200])
+        if len(output) >= limit:
+            break
+    return output
+
+
+def build_trend_analysis(prices: list[Any]) -> str:
+    if len(prices) < 2:
+        return "区间样本较短，暂不具备足够趋势信息。"
+    if len(prices) < 3:
+        open_price = float(prices[0].open)
+        close_price = float(prices[-1].close)
+        change = ((close_price - open_price) / open_price) * 100 if open_price else 0.0
+        return f"短区间内价格变动 {change:+.2f}%，趋势信息有限。"
+
+    mid = len(prices) // 2
+    first_open = float(prices[0].open)
+    first_close = float(prices[mid].close)
+    second_open = float(prices[mid].open)
+    second_close = float(prices[-1].close)
+    first_half = ((first_close - first_open) / first_open) * 100 if first_open else 0.0
+    second_half = ((second_close - second_open) / second_open) * 100 if second_open else 0.0
+    return (
+        f"前半段{'上涨' if first_half >= 0 else '下跌'} {abs(first_half):.2f}%，"
+        f"后半段{'上涨' if second_half >= 0 else '下跌'} {abs(second_half):.2f}%，"
+        "说明价格驱动在区间内部出现了阶段性切换。"
+    )
+
+
+def build_range_explanation(
+    *,
+    ticker: str,
+    start_date: str,
+    end_date: str,
+    news_rows: list[Dict[str, Any]],
+) -> Dict[str, Any]:
+    """Explain a price range with local price and news heuristics."""
+    prices = get_prices(ticker, start_date, end_date)
+    if not prices:
+        return {
+            "symbol": ticker,
+            "start_date": start_date,
+            "end_date": end_date,
+            "error": "No OHLC data for this range",
+        }
+
+    open_price = float(prices[0].open)
+    close_price = float(prices[-1].close)
+    high_price = max(float(price.high) for price in prices)
+    low_price = min(float(price.low) for price in prices)
+    total_volume = sum(int(price.volume) for price in prices)
+    price_change_pct = (
+        ((close_price - open_price) / open_price) * 100 if open_price else 0.0
+    )
+
+    categories = categorize_news_rows(news_rows)
+    news_count = len(news_rows)
+    dominant_categories = sorted(
+        (
+            {"category": key, "count": value["count"]}
+            for key, value in categories.items()
+            if value["count"] > 0
+        ),
+        key=lambda item: item["count"],
+        reverse=True,
+    )
+
+    direction = "上涨" if price_change_pct > 0 else "下跌" if price_change_pct < 0 else "横盘"
+    category_text = (
+        f"主要主题集中在 {', '.join(item['category'] for item in dominant_categories[:3])}。"
+        if dominant_categories
+        else "区间内未识别出明显的主题聚类。"
+    )
+    summary = (
+        f"{ticker} 在 {start_date} 至 {end_date} 区间内{direction} {abs(price_change_pct):.2f}%，"
+        f"区间覆盖 {len(prices)} 个交易日，关联新闻 {news_count} 条。{category_text}"
+    )
+
+    bullish_factors = summarize_bullish_factors(news_rows)
+    bearish_factors = summarize_bearish_factors(news_rows)
+    trend_analysis = build_trend_analysis(prices)
+    llm_source = "local"
+
+    range_payload = {
+        "ticker": ticker,
+        "start_date": start_date,
+        "end_date": end_date,
+        "price_change_pct": round(price_change_pct, 2),
+        "trading_days": len(prices),
+        "news_count": news_count,
+        "dominant_categories": dominant_categories[:5],
+        "bullish_factors": bullish_factors[:3],
+        "bearish_factors": bearish_factors[:3],
+        "trend_analysis": trend_analysis,
+        "top_news": [
+            {
+                "date": row.get("trade_date") or str(row.get("date") or "")[:10],
+                "title": row.get("title") or "",
+                "summary": row.get("summary") or "",
+                "sentiment": row.get("sentiment") or "",
+                "relevance": row.get("relevance") or "",
+                "ret_t0": row.get("ret_t0"),
+            }
+            for row in sorted(news_rows, key=_rank_event_score, reverse=True)[:5]
+        ],
+    }
+    llm_analysis = analyze_range_with_llm(range_payload)
+    if isinstance(llm_analysis, dict):
+        summary = llm_analysis.get("summary") or summary
+        trend_analysis = llm_analysis.get("trend_analysis") or trend_analysis
+        bullish_factors = llm_analysis.get("bullish_factors") or bullish_factors
+        bearish_factors = llm_analysis.get("bearish_factors") or bearish_factors
+        llm_source = "llm"
+
+    key_events = [
+        {
+            "date": row.get("trade_date") or str(row.get("date") or "")[:10],
+            "title": row.get("title") or "Untitled news",
+            "summary": row.get("summary") or "",
+            "category": row.get("category") or "",
+            "id": row.get("id"),
+            "sentiment": row.get("sentiment"),
+            "ret_t0": row.get("ret_t0"),
+        }
+        for row in sorted(news_rows, key=_rank_event_score, reverse=True)[:8]
+    ]
+
+    return {
+        "symbol": ticker,
+        "start_date": start_date,
+        "end_date": end_date,
+        "price_change_pct": round(price_change_pct, 2),
+        "open_price": open_price,
+        "close_price": close_price,
+        "high_price": high_price,
+        "low_price": low_price,
+        "total_volume": total_volume,
+        "trading_days": len(prices),
+        "news_count": news_count,
+        "dominant_categories": dominant_categories[:5],
+        "analysis": {
+            "summary": summary,
+            "key_events": key_events,
+            "bullish_factors": bullish_factors,
+            "bearish_factors": bearish_factors,
+            "trend_analysis": trend_analysis,
+            "analysis_source": llm_source,
+            "analysis_model_label": llm_analysis.get("model_label") if isinstance(llm_analysis, dict) else None,
+        },
+    }
--- a/backend/explain/similarity_service.py
+++ b/backend/explain/similarity_service.py
@@ -0,0 +1,202 @@
+# -*- coding: utf-8 -*-
+"""Same-ticker historical similar day search for explain view."""
+
+from __future__ import annotations
+
+from math import sqrt
+from typing import Any
+
+from backend.data.market_store import MarketStore
+
+
+def _safe_float(value: Any, default: float = 0.0) -> float:
+    try:
+        parsed = float(value)
+    except (TypeError, ValueError):
+        return default
+    return parsed
+
+
+def build_daily_feature_rows(
+    *,
+    symbol: str,
+    ohlc_rows: list[dict[str, Any]],
+    news_rows: list[dict[str, Any]],
+) -> list[dict[str, Any]]:
+    """Aggregate price/news context into daily feature rows."""
+    price_by_date = {str(row.get("date")): row for row in ohlc_rows if row.get("date")}
+    ordered_dates = [str(row.get("date")) for row in ohlc_rows if row.get("date")]
+
+    news_by_date: dict[str, list[dict[str, Any]]] = {}
+    for row in news_rows:
+        trade_date = str(row.get("trade_date") or "")[:10] or str(row.get("date") or "")[:10]
+        if not trade_date:
+            continue
+        news_by_date.setdefault(trade_date, []).append(row)
+
+    features: list[dict[str, Any]] = []
+    previous_close: float | None = None
+    for idx, date in enumerate(ordered_dates):
+        price_row = price_by_date[date]
+        close_price = _safe_float(price_row.get("close"))
+        open_price = _safe_float(price_row.get("open"), close_price)
+        day_news = news_by_date.get(date, [])
+        positive_count = sum(1 for item in day_news if str(item.get("sentiment") or "").lower() == "positive")
+        negative_count = sum(1 for item in day_news if str(item.get("sentiment") or "").lower() == "negative")
+        high_relevance_count = sum(
+            1 for item in day_news if str(item.get("relevance") or "").lower() in {"high", "relevant"}
+        )
+        ret_1d = (
+            ((close_price - previous_close) / previous_close)
+            if previous_close not in (None, 0)
+            else 0.0
+        )
+        intraday_ret = ((close_price - open_price) / open_price) if open_price else 0.0
+        sentiment_score = (
+            (positive_count - negative_count) / max(len(day_news), 1)
+            if day_news
+            else 0.0
+        )
+        future_t1 = None
+        future_t3 = None
+        if idx + 1 < len(ordered_dates) and close_price:
+            next_close = _safe_float(price_by_date[ordered_dates[idx + 1]].get("close"))
+            future_t1 = ((next_close - close_price) / close_price) if next_close else None
+        if idx + 3 < len(ordered_dates) and close_price:
+            next_close = _safe_float(price_by_date[ordered_dates[idx + 3]].get("close"))
+            future_t3 = ((next_close - close_price) / close_price) if next_close else None
+
+        features.append(
+            {
+                "date": date,
+                "symbol": symbol,
+                "n_articles": len(day_news),
+                "positive_count": positive_count,
+                "negative_count": negative_count,
+                "high_relevance_count": high_relevance_count,
+                "sentiment_score": sentiment_score,
+                "ret_1d": ret_1d,
+                "intraday_ret": intraday_ret,
+                "close": close_price,
+                "ret_t1_after": future_t1,
+                "ret_t3_after": future_t3,
+                "news": [
+                    {
+                        "title": row.get("title") or "",
+                        "sentiment": row.get("sentiment") or "neutral",
+                    }
+                    for row in day_news[:3]
+                ],
+            }
+        )
+        previous_close = close_price
+    return features
+
+
+def compute_similarity_scores(
+    target_vector: list[float],
+    candidate_vectors: list[tuple[str, list[float], dict[str, Any]]],
+) -> list[dict[str, Any]]:
+    """Return sorted similarity matches based on normalized Euclidean distance."""
+    if not candidate_vectors:
+        return []
+    dimensions = len(target_vector)
+    ranges = []
+    for dimension in range(dimensions):
+        values = [vector[1][dimension] for vector in candidate_vectors] + [target_vector[dimension]]
+        min_value = min(values)
+        max_value = max(values)
+        ranges.append(max(max_value - min_value, 1e-9))
+
+    scored = []
+    for date, vector, payload in candidate_vectors:
+        distance = sqrt(
+            sum(
+                ((target_vector[i] - vector[i]) / ranges[i]) ** 2
+                for i in range(dimensions)
+            )
+        )
+        similarity = 1.0 / (1.0 + distance)
+        scored.append(
+            {
+                "date": date,
+                "score": round(similarity, 4),
+                **payload,
+            }
+        )
+    return sorted(scored, key=lambda item: item["score"], reverse=True)
+
+
+def find_similar_days(
+    store: MarketStore,
+    *,
+    symbol: str,
+    target_date: str,
+    top_k: int = 10,
+) -> dict[str, Any]:
+    """Find same-ticker historical days most similar to a target day."""
+    cached = store.get_similar_day_cache(symbol, target_date=target_date)
+    if cached and cached.get("payload"):
+        return cached["payload"]
+
+    ohlc_rows = store.get_ohlc(symbol, "1900-01-01", target_date)
+    news_rows = store.get_news_items_enriched(symbol, end_date=target_date, limit=500)
+    daily_rows = build_daily_feature_rows(symbol=symbol, ohlc_rows=ohlc_rows, news_rows=news_rows)
+    feature_map = {row["date"]: row for row in daily_rows}
+    target_row = feature_map.get(target_date)
+    if not target_row:
+        return {
+            "symbol": symbol,
+            "target_date": target_date,
+            "items": [],
+            "error": "No feature row for target date",
+        }
+
+    vector_keys = [
+        "sentiment_score",
+        "n_articles",
+        "positive_count",
+        "negative_count",
+        "high_relevance_count",
+        "ret_1d",
+        "intraday_ret",
+    ]
+    target_vector = [_safe_float(target_row.get(key)) for key in vector_keys]
+    candidates = []
+    for row in daily_rows:
+        date = row["date"]
+        if date == target_date:
+            continue
+        payload = {
+            "n_articles": row["n_articles"],
+            "sentiment_score": round(row["sentiment_score"], 4),
+            "ret_1d": round(row["ret_1d"] * 100, 2),
+            "intraday_ret": round(row["intraday_ret"] * 100, 2),
+            "ret_t1_after": round(row["ret_t1_after"] * 100, 2) if row["ret_t1_after"] is not None else None,
+            "ret_t3_after": round(row["ret_t3_after"] * 100, 2) if row["ret_t3_after"] is not None else None,
+            "top_reasons": [item["title"] for item in row["news"][:2] if item.get("title")],
+            "news": row["news"],
+        }
+        candidates.append(
+            (
+                date,
+                [_safe_float(row.get(key)) for key in vector_keys],
+                payload,
+            )
+        )
+
+    items = compute_similarity_scores(target_vector, candidates)[: max(1, min(int(top_k), 20))]
+    result = {
+        "symbol": symbol,
+        "target_date": target_date,
+        "target_features": {
+            "sentiment_score": round(target_row["sentiment_score"], 4),
+            "n_articles": target_row["n_articles"],
+            "ret_1d": round(target_row["ret_1d"] * 100, 2),
+            "intraday_ret": round(target_row["intraday_ret"] * 100, 2),
+            "high_relevance_count": target_row["high_relevance_count"],
+        },
+        "items": items,
+    }
+    store.upsert_similar_day_cache(symbol, target_date=target_date, payload=result, source="local")
+    return result
--- a/backend/explain/story_service.py
+++ b/backend/explain/story_service.py
@@ -0,0 +1,127 @@
+# -*- coding: utf-8 -*-
+"""Stock story generation for explain view."""
+
+from __future__ import annotations
+
+from datetime import datetime, timedelta
+from typing import Any
+
+from backend.data.market_store import MarketStore
+
+
+def build_stock_story(
+    *,
+    symbol: str,
+    as_of_date: str,
+    price_rows: list[dict[str, Any]],
+    news_rows: list[dict[str, Any]],
+) -> str:
+    """Build a compact markdown story from enriched news and recent price action."""
+    lines = [f"## {symbol} Story", f"As of `{as_of_date}`"]
+    if not price_rows:
+        lines.append("")
+        lines.append("No OHLC data available for story generation.")
+        return "\n".join(lines)
+
+    open_price = float(price_rows[0].get("open") or price_rows[0].get("close") or 0.0)
+    close_price = float(price_rows[-1].get("close") or 0.0)
+    price_change = ((close_price - open_price) / open_price) * 100 if open_price else 0.0
+    high_price = max(float(row.get("high") or row.get("close") or 0.0) for row in price_rows)
+    low_price = min(float(row.get("low") or row.get("close") or 0.0) for row in price_rows)
+
+    lines.append("")
+    lines.append(
+        f"The stock moved {'up' if price_change >= 0 else 'down'} "
+        f"{abs(price_change):.2f}% over the recent window, trading between "
+        f"${low_price:.2f} and ${high_price:.2f}."
+    )
+
+    positive = [row for row in news_rows if str(row.get("sentiment") or "").lower() == "positive"]
+    negative = [row for row in news_rows if str(row.get("sentiment") or "").lower() == "negative"]
+    lines.append("")
+    lines.append(
+        f"Recent coverage included {len(news_rows)} relevant articles "
+        f"({len(positive)} positive / {len(negative)} negative)."
+    )
+
+    if news_rows:
+        lines.append("")
+        lines.append("### Key Moments")
+        ranked_rows = sorted(
+            news_rows,
+            key=lambda row: (
+                0 if str(row.get("relevance") or "").lower() in {"high", "relevant"} else 1,
+                -abs(float(row.get("ret_t0") or 0.0)),
+            ),
+        )
+        for row in ranked_rows[:5]:
+            trade_date = row.get("trade_date") or str(row.get("date") or "")[:10]
+            title = row.get("title") or "Untitled"
+            key_discussion = row.get("key_discussion") or row.get("summary") or ""
+            sentiment = str(row.get("sentiment") or "neutral").lower()
+            lines.append(
+                f"- `{trade_date}` [{sentiment}] {title}: {str(key_discussion).strip()[:220]}"
+            )
+
+    if positive:
+        lines.append("")
+        lines.append("### Bullish Threads")
+        for row in positive[:3]:
+            reason = row.get("reason_growth") or row.get("key_discussion") or row.get("summary") or row.get("title")
+            lines.append(f"- {str(reason).strip()[:220]}")
+
+    if negative:
+        lines.append("")
+        lines.append("### Bearish Threads")
+        for row in negative[:3]:
+            reason = row.get("reason_decrease") or row.get("key_discussion") or row.get("summary") or row.get("title")
+            lines.append(f"- {str(reason).strip()[:220]}")
+
+    return "\n".join(lines)
+
+
+def get_or_create_stock_story(
+    store: MarketStore,
+    *,
+    symbol: str,
+    as_of_date: str,
+) -> dict[str, Any]:
+    """Return cached story or build a new one from recent market context."""
+    cached = store.get_story_cache(symbol, as_of_date=as_of_date)
+    if cached:
+        return {
+            "symbol": symbol,
+            "as_of_date": as_of_date,
+            "story": cached.get("content") or "",
+            "source": cached.get("source") or "cache",
+        }
+
+    start_date = None
+    if len(as_of_date) >= 10:
+        target_date = datetime.strptime(as_of_date[:10], "%Y-%m-%d").date()
+        start_date = (target_date - timedelta(days=29)).isoformat()
+
+    price_rows = (
+        store.get_ohlc(symbol, start_date, as_of_date)
+        if start_date
+        else []
+    )
+    news_rows = store.get_news_items_enriched(
+        symbol,
+        start_date=start_date,
+        end_date=as_of_date,
+        limit=40,
+    )
+    story = build_stock_story(
+        symbol=symbol,
+        as_of_date=as_of_date,
+        price_rows=price_rows,
+        news_rows=news_rows,
+    )
+    store.upsert_story_cache(symbol, as_of_date=as_of_date, content=story, source="local")
+    return {
+        "symbol": symbol,
+        "as_of_date": as_of_date,
+        "story": story,
+        "source": "local",
+    }