# -*- coding: utf-8 -*-
"""Lightweight news enrichment for explain-oriented market analysis."""

from __future__ import annotations

import hashlib
from typing import Any

from backend.config.env_config import get_env_int
from backend.data.market_store import MarketStore
from backend.enrich.llm_enricher import (
    analyze_news_row_with_llm,
    analyze_news_rows_with_llm,
    llm_enrichment_enabled,
)

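# The keyword tuples below drive the local fallback classifier in
# classify_news_row(): matching is plain substring containment over the
# lowercased "title + summary" text, used when the LLM enricher is disabled
# or returns nothing usable.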
POSITIVE_KEYWORDS = (
    "beat", "surge", "gain", "growth", "record", "upgrade", "strong",
    "partnership", "approved", "launch", "expands", "profit",
)
NEGATIVE_KEYWORDS = (
    "miss", "drop", "fall", "cut", "downgrade", "weak", "warning",
    "delay", "lawsuit", "probe", "tariff", "decline", "layoff",
)
HIGH_RELEVANCE_KEYWORDS = (
    "earnings", "guidance", "profit", "revenue", "ceo", "fda", "tariff",
    "regulation", "acquisition", "buyback", "forecast", "launch",
)

def _dedupe_key(row: dict[str, Any]) -> str:
    """Hash trade date, normalized title, and summary prefix into a stable key.

    Rows sharing all three collapse to the same key, so later duplicates can
    be dropped in build_analysis_rows().
    """
    trade_date = str(row.get("trade_date") or row.get("date") or "")[:10]
    title = str(row.get("title") or "").strip().lower()
    summary = str(row.get("summary") or "").strip().lower()[:160]
    raw = f"{trade_date}::{title}::{summary}"
    return hashlib.sha1(raw.encode("utf-8")).hexdigest()

def _chunk_rows(rows: list[dict[str, Any]], size: int) -> list[list[dict[str, Any]]]:
    """Split rows into consecutive chunks of at most ``size`` items."""
    chunk_size = max(1, int(size))
    return [rows[index:index + chunk_size] for index in range(0, len(rows), chunk_size)]

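# Example for _chunk_rows(): a 5-row list with size=2 yields three chunks of
# sizes 2, 2, and 1, since range(0, 5, 2) produces start offsets 0, 2, and 4.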
def classify_news_row(row: dict[str, Any]) -> dict[str, Any]:
    """Return a lightweight explain-oriented analysis for one article."""
    # Prefer the LLM enricher when it yields a usable result.
    llm_result = analyze_news_row_with_llm(row)
    if isinstance(llm_result, dict):
        merged = dict(llm_result)
        merged.setdefault("summary", str(row.get("summary") or row.get("title") or "")[:280])
        merged.setdefault("raw_json", row)
        merged["analysis_source"] = "llm"
        return merged

    # Local fallback: keyword voting over the lowercased title + summary.
    title = str(row.get("title") or "").strip()
    summary = str(row.get("summary") or "").strip()
    text = f"{title} {summary}".lower()

    positive_hits = [keyword for keyword in POSITIVE_KEYWORDS if keyword in text]
    negative_hits = [keyword for keyword in NEGATIVE_KEYWORDS if keyword in text]
    relevance_hits = [keyword for keyword in HIGH_RELEVANCE_KEYWORDS if keyword in text]

    if len(positive_hits) > len(negative_hits):
        sentiment = "positive"
    elif len(negative_hits) > len(positive_hits):
        sentiment = "negative"
    else:
        sentiment = "neutral"

    relevance = "high" if relevance_hits else "medium" if title else "low"
    summary_text = summary or title
    key_discussion = ""
    if relevance_hits:
        key_discussion = f"Key topics center on {', '.join(relevance_hits[:3])}"
    elif summary_text:
        key_discussion = summary_text[:160]

    reason_growth = ""
    reason_decrease = ""
    if sentiment == "positive":
        reason_growth = summary_text[:200]
    elif sentiment == "negative":
        reason_decrease = summary_text[:200]

    return {
        "relevance": relevance,
        "sentiment": sentiment,
        "key_discussion": key_discussion,
        "summary": summary_text[:280],
        "reason_growth": reason_growth,
        "reason_decrease": reason_decrease,
        "analysis_source": "local",
        "raw_json": row,
    }

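# Example for the classify_news_row() fallback vote (hypothetical headline):
# "Profit beats forecast despite tariff warning" hits two positive keywords
# ("beat", "profit") and two negative ones ("tariff", "warning"), so the vote
# ties and sentiment stays "neutral", while "profit", "forecast", and
# "tariff" still push relevance to "high".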
def attach_forward_returns(
    *,
    news_rows: list[dict[str, Any]],
    ohlc_rows: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Attach forward-return labels to each analyzed row."""
    if not ohlc_rows:
        return news_rows

    closes_by_date = {
        str(row.get("date")): float(row.get("close"))
        for row in ohlc_rows
        if row.get("date") is not None and row.get("close") is not None
    }
    ordered_dates = [str(row.get("date")) for row in ohlc_rows if row.get("date") is not None]
    date_index = {date: idx for idx, date in enumerate(ordered_dates)}

    # Offsets are steps into ordered_dates (trading days), not calendar days.
    horizons = {
        "ret_t0": 0,
        "ret_t1": 1,
        "ret_t3": 3,
        "ret_t5": 5,
        "ret_t10": 10,
    }

    enriched: list[dict[str, Any]] = []
    for row in news_rows:
        trade_date = str(row.get("trade_date") or "")[:10]
        base_close = closes_by_date.get(trade_date)
        if not trade_date or base_close in (None, 0):
            enriched.append(row)
            continue

        next_row = dict(row)
        base_index = date_index.get(trade_date)
        if base_index is None:
            enriched.append(next_row)
            continue

        for field, offset in horizons.items():
            target_index = base_index + offset
            if target_index >= len(ordered_dates):
                # Horizon extends past the available price history.
                next_row[field] = None
                continue
            target_close = closes_by_date.get(ordered_dates[target_index])
            next_row[field] = (
                (float(target_close) - float(base_close)) / float(base_close)
                if target_close not in (None, 0)
                else None
            )
        enriched.append(next_row)
    return enriched

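# Worked example for attach_forward_returns() (illustrative numbers): with a
# base close of 100.0 on the article's trade date and a close of 103.0 five
# trading days later, ret_t5 = (103.0 - 100.0) / 100.0 = 0.03; horizons that
# run past the available history are filled with None.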
def build_analysis_rows(
    *,
    symbol: str,
    news_rows: list[dict[str, Any]],
    ohlc_rows: list[dict[str, Any]],
) -> tuple[list[dict[str, Any]], dict[str, int]]:
    """Transform raw news rows into market_store news_analysis payloads plus stats."""
    # Pre-compute LLM analyses in batches keyed by news id.
    llm_results: dict[str, dict[str, Any]] = {}
    if llm_enrichment_enabled():
        batch_size = get_env_int("EXPLAIN_ENRICH_BATCH_SIZE", 8)
        for chunk in _chunk_rows(news_rows, batch_size):
            llm_results.update(analyze_news_rows_with_llm(chunk))

    staged_rows: list[dict[str, Any]] = []
    seen_dedupe_keys: set[str] = set()
    deduped_count = 0
    llm_count = 0
    local_count = 0
    for row in news_rows:
        news_id = str(row.get("id") or "").strip()
        if not news_id:
            continue
        dedupe_key = _dedupe_key(row)
        if dedupe_key in seen_dedupe_keys:
            deduped_count += 1
            continue
        seen_dedupe_keys.add(dedupe_key)
        batch_result = llm_results.get(news_id)
        if isinstance(batch_result, dict):
            analysis = dict(batch_result)
            analysis.setdefault("summary", str(row.get("summary") or row.get("title") or "")[:280])
            analysis.setdefault("raw_json", row)
            analysis["analysis_source"] = "llm"
            llm_count += 1
        else:
            # Fall back to per-row analysis (which itself tries the LLM first).
            analysis = classify_news_row(row)
            if analysis.get("analysis_source") == "llm":
                llm_count += 1
            else:
                local_count += 1
        staged_rows.append(
            {
                "news_id": news_id,
                "trade_date": str(row.get("trade_date") or "")[:10] or None,
                **analysis,
            }
        )
    return (
        attach_forward_returns(news_rows=staged_rows, ohlc_rows=ohlc_rows),
        {
            "deduped_count": deduped_count,
            "llm_count": llm_count,
            "local_count": local_count,
        },
    )

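# build_analysis_rows() sizes its LLM batches from EXPLAIN_ENRICH_BATCH_SIZE
# (default 8); rows the batch call does not cover still go through the
# per-row path in classify_news_row().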
def enrich_news_for_symbol(
    store: MarketStore,
    symbol: str,
    *,
    start_date: str | None = None,
    end_date: str | None = None,
    limit: int = 200,
    analysis_source: str = "local",
    skip_existing: bool = True,
    only_reanalyze_local: bool = False,
) -> dict[str, Any]:
    """Read raw market news, compute explain fields, and persist them."""
    normalized_symbol = str(symbol or "").strip().upper()
    if not normalized_symbol:
        return {"symbol": "", "analyzed": 0}

    news_rows = store.get_news_items(
        normalized_symbol,
        start_date=start_date,
        end_date=end_date,
        limit=limit,
    )
    total_news_count = len(news_rows)
    skipped_existing_count = 0
    analyzed_sources: dict[str, str] = {}
    skipped_missing_analysis_count = 0
    skipped_non_local_count = 0
    if news_rows and only_reanalyze_local:
        # Upgrade mode: keep only rows whose existing analysis came from the
        # local keyword classifier, so they can be re-run through the LLM.
        analyzed_sources = store.get_analyzed_news_sources(
            normalized_symbol,
            start_date=start_date,
            end_date=end_date,
        )
        skipped_missing_analysis_count = sum(
            1
            for row in news_rows
            if str(row.get("id") or "").strip() not in analyzed_sources
        )
        skipped_non_local_count = sum(
            1
            for row in news_rows
            if str(row.get("id") or "").strip() in analyzed_sources
            and analyzed_sources.get(str(row.get("id") or "").strip()) != "local"
        )
        # Everything that is not an existing local analysis gets skipped.
        skipped_existing_count = sum(
            1
            for row in news_rows
            if str(row.get("id") or "").strip() not in analyzed_sources
            or analyzed_sources.get(str(row.get("id") or "").strip()) != "local"
        )
        news_rows = [
            row for row in news_rows
            if analyzed_sources.get(str(row.get("id") or "").strip()) == "local"
        ]
    elif skip_existing and news_rows:
        # Default mode: analyze only rows with no stored analysis yet.
        analyzed_ids = store.get_analyzed_news_ids(
            normalized_symbol,
            start_date=start_date,
            end_date=end_date,
        )
        skipped_existing_count = sum(
            1
            for row in news_rows
            if str(row.get("id") or "").strip() in analyzed_ids
        )
        news_rows = [
            row for row in news_rows
            if str(row.get("id") or "").strip() not in analyzed_ids
        ]
    # news_rows are assumed to come back newest-first, so the last row bounds
    # the OHLC window start and the first row bounds its end.
    ohlc_start = start_date or (news_rows[-1]["trade_date"] if news_rows and news_rows[-1].get("trade_date") else None)
    ohlc_end = end_date or (news_rows[0]["trade_date"] if news_rows and news_rows[0].get("trade_date") else None)
    ohlc_rows = (
        store.get_ohlc(normalized_symbol, ohlc_start, ohlc_end)
        if ohlc_start and ohlc_end
        else []
    )
    analysis_rows, stats = build_analysis_rows(
        symbol=normalized_symbol,
        news_rows=news_rows,
        ohlc_rows=ohlc_rows,
    )
    analyzed = store.upsert_news_analysis(
        normalized_symbol,
        analysis_rows,
        analysis_source=analysis_source,
    )
    # Distinct trade dates that now carry an LLM-sourced analysis.
    upgraded_dates = sorted(
        {
            str(row.get("trade_date") or "")[:10]
            for row in analysis_rows
            if str(row.get("analysis_source") or "").strip().lower() == "llm"
            and str(row.get("trade_date") or "").strip()
        }
    )
    # Up to five titles whose analysis is still "local" after this pass.
    remaining_local_titles = [
        str(row.get("title") or row.get("news_id") or "").strip()
        for row in news_rows
        for analyzed_row in analysis_rows
        if str(analyzed_row.get("news_id") or "").strip() == str(row.get("id") or "").strip()
        and str(analyzed_row.get("analysis_source") or "").strip().lower() == "local"
    ][:5]
    return {
        "symbol": normalized_symbol,
        "analyzed": analyzed,
        "news_count": total_news_count,
        "queued_count": len(news_rows),
        "skipped_existing_count": skipped_existing_count,
        "deduped_count": stats["deduped_count"],
        "llm_count": stats["llm_count"],
        "local_count": stats["local_count"],
        "only_reanalyze_local": only_reanalyze_local,
        "upgraded_local_to_llm_count": (
            stats["llm_count"]
            if only_reanalyze_local
            else 0
        ),
        "execution_summary": {
            "upgraded_dates": upgraded_dates[:5],
            "remaining_local_titles": remaining_local_titles,
            "skipped_missing_analysis_count": skipped_missing_analysis_count,
            "skipped_non_local_count": skipped_non_local_count,
        },
    }

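# Minimal usage sketch for enrich_news_for_symbol() (illustrative; assumes a
# configured MarketStore, and the constructor arguments below are
# placeholders, not the real signature):
#
#     store = MarketStore(...)
#     report = enrich_news_for_symbol(
#         store,
#         "AAPL",
#         start_date="2024-01-01",
#         end_date="2024-03-31",
#     )
#     print(report["analyzed"], report["llm_count"], report["local_count"])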
def enrich_symbols(
    store: MarketStore,
    symbols: list[str],
    *,
    start_date: str | None = None,
    end_date: str | None = None,
    limit: int = 200,
    analysis_source: str = "local",
    skip_existing: bool = True,
    only_reanalyze_local: bool = False,
) -> list[dict[str, Any]]:
    """Batch enrich multiple symbols for explain-oriented news analysis."""
    results: list[dict[str, Any]] = []
    for symbol in symbols:
        normalized_symbol = str(symbol or "").strip().upper()
        if not normalized_symbol:
            continue
        results.append(
            enrich_news_for_symbol(
                store,
                normalized_symbol,
                start_date=start_date,
                end_date=end_date,
                limit=limit,
                analysis_source=analysis_source,
                skip_existing=skip_existing,
                only_reanalyze_local=only_reanalyze_local,
            )
        )
    return results