Initial commit of integrated agent system

This commit is contained in:
cillin
2026-03-30 17:46:44 +08:00
commit 0fa413380c
337 changed files with 75268 additions and 0 deletions

View File

@@ -0,0 +1,2 @@
"""News enrichment utilities for explain-oriented market research."""

View File

@@ -0,0 +1,301 @@
# -*- coding: utf-8 -*-
"""Optional AgentScope-backed news enrichment with safe local fallback."""
from __future__ import annotations
import asyncio
import logging
from concurrent.futures import ThreadPoolExecutor
from typing import Any
from pydantic import BaseModel, Field
from backend.config.env_config import canonicalize_model_provider, get_env_bool, get_env_str
from backend.llm.models import create_model
logger = logging.getLogger(__name__)
class EnrichedNewsItem(BaseModel):
    """Structured output schema for one enriched article.

    Used as the structured-output target for per-article LLM calls; the
    Field descriptions double as generation instructions for the model.
    """
    # Echoed back from the request so results can be joined to source rows.
    id: str = Field(description="The source article id")
    relevance: str = Field(description="One of high, medium, low")
    sentiment: str = Field(description="One of positive, negative, neutral")
    key_discussion: str = Field(description="Concise core discussion")
    summary: str = Field(description="Concise factual summary")
    reason_growth: str = Field(description="Growth-oriented reason if present")
    reason_decrease: str = Field(description="Downside-oriented reason if present")
class EnrichedNewsBatch(BaseModel):
    """Structured output schema for a batch of enriched articles."""
    # One entry per requested article; ids should match the request payload.
    items: list[EnrichedNewsItem]
class RangeAnalysisPayload(BaseModel):
    """Structured output schema for range explanation text.

    Structured-output target for analyze_range_with_llm; the Field
    descriptions double as generation instructions for the model.
    """
    summary: str = Field(description="Concise Chinese range summary for the selected window")
    trend_analysis: str = Field(description="Concise Chinese trend explanation for the selected window")
    # Both factor lists are truncated to 3 entries by the caller.
    bullish_factors: list[str] = Field(description="Top bullish factors in Chinese")
    bearish_factors: list[str] = Field(description="Top bearish factors in Chinese")
def get_explain_model_info() -> dict[str, str]:
    """Resolve provider/model used by explain enrichment.

    Explain-specific env overrides win; otherwise the global MODEL_* vars
    apply, defaulting to OPENAI / gpt-4o-mini.
    """
    raw_provider = get_env_str("EXPLAIN_ENRICH_MODEL_PROVIDER") or get_env_str(
        "MODEL_PROVIDER",
        "OPENAI",
    )
    provider = canonicalize_model_provider(raw_provider)
    model_name = get_env_str("EXPLAIN_ENRICH_MODEL_NAME") or get_env_str(
        "MODEL_NAME",
        "gpt-4o-mini",
    )
    return {
        "provider": provider,
        "model_name": model_name,
        "label": f"{provider}:{model_name}",
    }
def _normalize_enrichment_payload(payload: Any) -> dict[str, Any] | None:
    """Coerce an LLM response payload into the canonical enrichment dict.

    Returns None when the payload is not dict-shaped; empty strings become
    None so callers can use truthiness checks.
    """
    if isinstance(payload, BaseModel):
        payload = payload.model_dump()
    if not isinstance(payload, dict):
        return None

    def _text(key: str, *, lower: bool = False) -> str | None:
        # Stripped (optionally lower-cased) field value, or None if empty.
        value = str(payload.get(key) or "").strip()
        if lower:
            value = value.lower()
        return value or None

    return {
        "relevance": _text("relevance", lower=True),
        "sentiment": _text("sentiment", lower=True),
        "key_discussion": _text("key_discussion"),
        "summary": _text("summary"),
        "reason_growth": _text("reason_growth"),
        "reason_decrease": _text("reason_decrease"),
        # Keep the full original payload for auditing/debugging downstream.
        "raw_json": payload,
    }
def _run_async(coro: Any) -> Any:
"""Run an async AgentScope model call from sync code, even inside a running loop."""
try:
asyncio.get_running_loop()
except RuntimeError:
return asyncio.run(coro)
with ThreadPoolExecutor(max_workers=1) as executor:
future = executor.submit(asyncio.run, coro)
return future.result()
def _get_explain_model():
    """Create an AgentScope model for explain enrichment.

    Non-streaming with a low temperature: callers consume one structured
    response and extraction should stay near-deterministic.
    """
    info = get_explain_model_info()
    return create_model(
        model_name=info["model_name"],
        provider=info["provider"],
        stream=False,
        generate_kwargs={"temperature": 0.1},
    )
def llm_enrichment_enabled() -> bool:
    """Return whether AgentScope-backed LLM enrichment should be attempted.

    Requires the EXPLAIN_ENRICH_USE_LLM flag plus the API key matching the
    resolved provider (Ollama is local and needs no key).
    """
    if not get_env_bool("EXPLAIN_ENRICH_USE_LLM", False):
        return False
    provider = get_explain_model_info()["provider"]
    # Env var holding the API key for each supported provider.
    key_env_by_provider = {
        "OPENAI": "OPENAI_API_KEY",
        "ANTHROPIC": "ANTHROPIC_API_KEY",
        "DASHSCOPE": "DASHSCOPE_API_KEY",
        "ALIBABA": "DASHSCOPE_API_KEY",
        "GEMINI": "GOOGLE_API_KEY",
        "GOOGLE": "GOOGLE_API_KEY",
        "DEEPSEEK": "DEEPSEEK_API_KEY",
        "GROQ": "GROQ_API_KEY",
        "OPENROUTER": "OPENROUTER_API_KEY",
    }
    env_key = key_env_by_provider.get(provider)
    if env_key is None:
        # Unknown provider: only the key-less local Ollama backend qualifies.
        return provider == "OLLAMA"
    return bool(get_env_str(env_key))
def llm_range_analysis_enabled() -> bool:
    """Return whether LLM range analysis should be attempted.

    When EXPLAIN_RANGE_USE_LLM is unset/blank, range analysis simply
    follows the enrichment toggle; otherwise both must be enabled.
    """
    raw_value = get_env_str("EXPLAIN_RANGE_USE_LLM")
    explicitly_set = raw_value is not None and str(raw_value).strip() != ""
    if not explicitly_set:
        return llm_enrichment_enabled()
    return get_env_bool("EXPLAIN_RANGE_USE_LLM", False) and llm_enrichment_enabled()
def analyze_news_row_with_llm(row: dict[str, Any]) -> dict[str, Any] | None:
    """Generate explain-oriented structured analysis for one article.

    Args:
        row: Raw news row; only ``id``, ``title`` and ``summary`` are read.

    Returns:
        Normalized enrichment dict (see ``_normalize_enrichment_payload``)
        annotated with model identity, or ``None`` when enrichment is
        disabled or the model call fails.
    """
    if not llm_enrichment_enabled():
        return None
    model = _get_explain_model()
    title = str(row.get("title") or "").strip()
    summary = str(row.get("summary") or "").strip()
    messages = [
        {
            "role": "system",
            "content": (
                "You produce concise structured financial news analysis. "
                "Use only the requested fields and keep content factual."
            ),
        },
        {
            "role": "user",
            "content": (
                "Analyze this stock-news article for an explain UI.\n"
                "Rules:\n"
                "- relevance must be one of: high, medium, low\n"
                "- sentiment must be one of: positive, negative, neutral\n"
                "- keep each text field concise and factual\n"
                f"- article id: {str(row.get('id') or '').strip()}\n"
                f"Title: {title}\n"
                f"Summary: {summary}\n"
            ),
        },
    ]
    try:
        response = _run_async(model(messages=messages, structured_model=EnrichedNewsItem))
    except Exception as e:
        # Best-effort: callers fall back to the local classifier on failure.
        logger.warning("LLM enrichment failed: %s", e)
        return None
    payload = _normalize_enrichment_payload(getattr(response, "metadata", None))
    if payload:
        # Resolve the model identity once instead of three separate lookups.
        model_info = get_explain_model_info()
        payload.setdefault("raw_json", {})
        payload["raw_json"]["model_provider"] = model_info["provider"]
        payload["raw_json"]["model_name"] = model_info["model_name"]
        payload["raw_json"]["model_label"] = model_info["label"]
    return payload
def analyze_news_rows_with_llm(rows: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
    """Generate structured analysis for multiple articles in one request.

    Args:
        rows: Raw news rows; only ``id``, ``title`` and ``summary`` are sent.
              Rows without an id are dropped from the request.

    Returns:
        Mapping of news id -> normalized enrichment dict. Empty when
        enrichment is disabled, input is empty, or the call/parsing fails.
    """
    if not llm_enrichment_enabled() or not rows:
        return {}
    payload_rows = [
        {
            "id": str(row.get("id") or "").strip(),
            "title": str(row.get("title") or "").strip(),
            "summary": str(row.get("summary") or "").strip(),
        }
        for row in rows
        if str(row.get("id") or "").strip()
    ]
    if not payload_rows:
        return {}
    model = _get_explain_model()
    messages = [
        {
            "role": "system",
            "content": (
                "You produce concise structured financial news analysis in JSON. "
                "Preserve ids exactly and do not invent extra items."
            ),
        },
        {
            "role": "user",
            "content": (
                "Analyze these stock-news articles for an explain UI.\n"
                "For each item return: id, relevance, sentiment, key_discussion, summary, "
                "reason_growth, reason_decrease.\n"
                "Rules:\n"
                "- relevance must be one of: high, medium, low\n"
                "- sentiment must be one of: positive, negative, neutral\n"
                "- keep all text concise and factual\n"
                f"Articles: {payload_rows}"
            ),
        },
    ]
    try:
        response = _run_async(
            model(messages=messages, structured_model=EnrichedNewsBatch),
        )
    except Exception as e:
        # Best-effort batch call: log (previously swallowed silently, unlike
        # the single-row path) and let callers fall back to local analysis.
        logger.warning("LLM batch enrichment failed: %s", e)
        return {}
    metadata = getattr(response, "metadata", None)
    if isinstance(metadata, BaseModel):
        metadata = metadata.model_dump()
    items = metadata.get("items") if isinstance(metadata, dict) else None
    if not isinstance(items, list):
        return {}
    # Model identity is the same for every item; resolve it once, not 3x per item.
    model_info = get_explain_model_info()
    results: dict[str, dict[str, Any]] = {}
    for item in items:
        # Dump pydantic items once up front instead of per-field later.
        if isinstance(item, BaseModel):
            item = item.model_dump()
        if not isinstance(item, dict):
            continue
        news_id = str(item.get("id") or "").strip()
        normalized = _normalize_enrichment_payload(item)
        if not normalized or not news_id:
            continue
        normalized.setdefault("raw_json", {})
        normalized["raw_json"]["model_provider"] = model_info["provider"]
        normalized["raw_json"]["model_name"] = model_info["model_name"]
        normalized["raw_json"]["model_label"] = model_info["label"]
        results[news_id] = normalized
    return results
def analyze_range_with_llm(payload: dict[str, Any]) -> dict[str, Any] | None:
    """Generate explain-oriented range summary and factor refinement.

    Args:
        payload: Fact dict for the selected window; serialized verbatim
            into the prompt, so it should already be compact.

    Returns:
        Dict with Chinese summary/trend text, factor lists capped at three
        entries each, and model identity fields; ``None`` when disabled or
        the call/parsing fails.
    """
    if not llm_range_analysis_enabled():
        return None
    model = _get_explain_model()
    messages = [
        {
            "role": "system",
            "content": (
                "You write concise Chinese stock range analysis for an explain UI. "
                "Use only the supplied facts. Keep the tone factual and analyst-like."
            ),
        },
        {
            "role": "user",
            "content": (
                "请基于给定事实生成区间分析。\n"
                "输出字段summary, trend_analysis, bullish_factors, bearish_factors。\n"
                "要求:\n"
                "- 全部使用简体中文\n"
                "- summary 1到2句概括区间走势、新闻密度和主导主题\n"
                "- trend_analysis 1句解释区间内部阶段变化\n"
                "- bullish_factors 和 bearish_factors 各返回最多3条短句\n"
                "- 不要编造未提供的信息\n"
                f"事实数据: {payload}"
            ),
        },
    ]
    try:
        response = _run_async(
            model(messages=messages, structured_model=RangeAnalysisPayload),
        )
    except Exception as e:
        # Name the failing stage (was a copy-pasted "LLM enrichment failed")
        # so range failures are distinguishable from per-article enrichment.
        logger.warning("LLM range analysis failed: %s", e)
        return None
    metadata = getattr(response, "metadata", None)
    if isinstance(metadata, BaseModel):
        metadata = metadata.model_dump()
    if not isinstance(metadata, dict):
        return None

    def _top_factors(key: str) -> list[str]:
        # At most three non-empty, stripped factor strings.
        stripped = [str(item).strip() for item in list(metadata.get(key) or [])]
        return [text for text in stripped if text][:3]

    # Resolve model identity once instead of three separate lookups.
    model_info = get_explain_model_info()
    return {
        "summary": str(metadata.get("summary") or "").strip() or None,
        "trend_analysis": str(metadata.get("trend_analysis") or "").strip() or None,
        "bullish_factors": _top_factors("bullish_factors"),
        "bearish_factors": _top_factors("bearish_factors"),
        "model_provider": model_info["provider"],
        "model_name": model_info["model_name"],
        "model_label": model_info["label"],
    }

View File

@@ -0,0 +1,362 @@
# -*- coding: utf-8 -*-
"""Lightweight news enrichment for explain-oriented market analysis."""
from __future__ import annotations
import hashlib
from typing import Any
from backend.config.env_config import get_env_int
from backend.enrich.llm_enricher import (
analyze_news_row_with_llm,
analyze_news_rows_with_llm,
llm_enrichment_enabled,
)
from backend.data.market_store import MarketStore
# Keyword lists driving the local (non-LLM) fallback classifier in
# classify_news_row. Matching is substring-based against the lower-cased
# "title summary" text, so overlaps (e.g. "launch", "tariff") are deliberate.
POSITIVE_KEYWORDS = (
    "beat", "surge", "gain", "growth", "record", "upgrade", "strong",
    "partnership", "approved", "launch", "expands", "profit",
)
# Terms suggesting bearish/negative developments.
NEGATIVE_KEYWORDS = (
    "miss", "drop", "fall", "cut", "downgrade", "weak", "warning",
    "delay", "lawsuit", "probe", "tariff", "decline", "layoff",
)
# Terms that mark an article as highly relevant to price action.
HIGH_RELEVANCE_KEYWORDS = (
    "earnings", "guidance", "profit", "revenue", "ceo", "fda", "tariff",
    "regulation", "acquisition", "buyback", "forecast", "launch",
)
def _dedupe_key(row: dict[str, Any]) -> str:
trade_date = str(row.get("trade_date") or row.get("date") or "")[:10]
title = str(row.get("title") or "").strip().lower()
summary = str(row.get("summary") or "").strip().lower()[:160]
raw = f"{trade_date}::{title}::{summary}"
return hashlib.sha1(raw.encode("utf-8")).hexdigest()
def _chunk_rows(rows: list[dict[str, Any]], size: int) -> list[list[dict[str, Any]]]:
chunk_size = max(1, int(size))
return [rows[index:index + chunk_size] for index in range(0, len(rows), chunk_size)]
def classify_news_row(row: dict[str, Any]) -> dict[str, Any]:
    """Return a lightweight explain-oriented analysis for one article.

    Tries the LLM first; on any miss it falls back to keyword heuristics so
    the pipeline always produces an analysis payload.
    """
    llm_result = analyze_news_row_with_llm(row)
    if isinstance(llm_result, dict):
        enriched = dict(llm_result)
        enriched.setdefault("summary", str(row.get("summary") or row.get("title") or "")[:280])
        enriched.setdefault("raw_json", row)
        enriched["analysis_source"] = "llm"
        return enriched
    title = str(row.get("title") or "").strip()
    summary = str(row.get("summary") or "").strip()
    haystack = f"{title} {summary}".lower()
    positive_hits = [kw for kw in POSITIVE_KEYWORDS if kw in haystack]
    negative_hits = [kw for kw in NEGATIVE_KEYWORDS if kw in haystack]
    relevance_hits = [kw for kw in HIGH_RELEVANCE_KEYWORDS if kw in haystack]
    # Sentiment = whichever keyword family dominates; a tie stays neutral.
    if len(positive_hits) > len(negative_hits):
        sentiment = "positive"
    elif len(negative_hits) > len(positive_hits):
        sentiment = "negative"
    else:
        sentiment = "neutral"
    if relevance_hits:
        relevance = "high"
    elif title:
        relevance = "medium"
    else:
        relevance = "low"
    summary_text = summary or title
    if relevance_hits:
        key_discussion = f"核心主题集中在 {', '.join(relevance_hits[:3])}"
    elif summary_text:
        key_discussion = summary_text[:160]
    else:
        key_discussion = ""
    reason_growth = summary_text[:200] if sentiment == "positive" else ""
    reason_decrease = summary_text[:200] if sentiment == "negative" else ""
    return {
        "relevance": relevance,
        "sentiment": sentiment,
        "key_discussion": key_discussion,
        "summary": summary_text[:280],
        "reason_growth": reason_growth,
        "reason_decrease": reason_decrease,
        "analysis_source": "local",
        "raw_json": row,
    }
def attach_forward_returns(
    *,
    news_rows: list[dict[str, Any]],
    ohlc_rows: list[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Attach forward-return labels (ret_t0..ret_t10) to each analyzed row.

    Returns the input list unchanged when no OHLC data is available; rows
    whose trade date has no usable anchor close are passed through as-is.
    """
    if not ohlc_rows:
        return news_rows
    closes_by_date: dict[str, float] = {}
    ordered_dates: list[str] = []
    for candle in ohlc_rows:
        if candle.get("date") is None:
            continue
        date_key = str(candle.get("date"))
        ordered_dates.append(date_key)
        if candle.get("close") is not None:
            closes_by_date[date_key] = float(candle.get("close"))
    date_index = {date: idx for idx, date in enumerate(ordered_dates)}
    # Trading-day offsets for each forward-return label.
    horizons = (("ret_t0", 0), ("ret_t1", 1), ("ret_t3", 3), ("ret_t5", 5), ("ret_t10", 10))
    labeled: list[dict[str, Any]] = []
    for row in news_rows:
        trade_date = str(row.get("trade_date") or "")[:10]
        base_close = closes_by_date.get(trade_date)
        if not trade_date or base_close in (None, 0):
            # No anchor close: returns cannot be computed for this row.
            labeled.append(row)
            continue
        annotated = dict(row)
        base_index = date_index.get(trade_date)
        if base_index is None:
            labeled.append(annotated)
            continue
        for field, offset in horizons:
            target_index = base_index + offset
            target_close = (
                closes_by_date.get(ordered_dates[target_index])
                if target_index < len(ordered_dates)
                else None
            )
            if target_close in (None, 0):
                annotated[field] = None
            else:
                annotated[field] = (float(target_close) - float(base_close)) / float(base_close)
        labeled.append(annotated)
    return labeled
def build_analysis_rows(
    *,
    symbol: str,
    news_rows: list[dict[str, Any]],
    ohlc_rows: list[dict[str, Any]],
) -> tuple[list[dict[str, Any]], dict[str, int]]:
    """Transform raw news rows into market_store news_analysis payloads plus stats.

    NOTE(review): ``symbol`` is currently unused in the body; kept for
    interface stability with callers.
    """
    # Run the batch LLM path up front; ids map to their batch results.
    llm_results: dict[str, dict[str, Any]] = {}
    if llm_enrichment_enabled():
        batch_size = get_env_int("EXPLAIN_ENRICH_BATCH_SIZE", 8)
        for chunk in _chunk_rows(news_rows, batch_size):
            llm_results.update(analyze_news_rows_with_llm(chunk))
    staged_rows: list[dict[str, Any]] = []
    seen_fingerprints: set[str] = set()
    counters = {"deduped_count": 0, "llm_count": 0, "local_count": 0}
    for row in news_rows:
        news_id = str(row.get("id") or "").strip()
        if not news_id:
            continue
        fingerprint = _dedupe_key(row)
        if fingerprint in seen_fingerprints:
            counters["deduped_count"] += 1
            continue
        seen_fingerprints.add(fingerprint)
        batch_result = llm_results.get(news_id)
        if isinstance(batch_result, dict):
            analysis = dict(batch_result)
            analysis.setdefault("summary", str(row.get("summary") or row.get("title") or "")[:280])
            analysis.setdefault("raw_json", row)
            analysis["analysis_source"] = "llm"
            counters["llm_count"] += 1
        else:
            # Per-row path: may still hit the LLM, else keyword fallback.
            analysis = classify_news_row(row)
            if analysis.get("analysis_source") == "llm":
                counters["llm_count"] += 1
            else:
                counters["local_count"] += 1
        staged_rows.append(
            {
                "news_id": news_id,
                "trade_date": str(row.get("trade_date") or "")[:10] or None,
                **analysis,
            }
        )
    return attach_forward_returns(news_rows=staged_rows, ohlc_rows=ohlc_rows), counters
def enrich_news_for_symbol(
    store: MarketStore,
    symbol: str,
    *,
    start_date: str | None = None,
    end_date: str | None = None,
    limit: int = 200,
    analysis_source: str = "local",
    skip_existing: bool = True,
    only_reanalyze_local: bool = False,
) -> dict[str, Any]:
    """Read raw market news, compute explain fields, and persist them.

    Args:
        store: Market data store used for reads and the analysis upsert.
        symbol: Ticker; blank input short-circuits to an empty result.
        start_date: Optional window start for news/OHLC queries.
        end_date: Optional window end for news/OHLC queries.
        limit: Maximum number of raw news rows to consider.
        analysis_source: Source tag recorded by the upsert.
        skip_existing: Skip rows that already have any analysis.
        only_reanalyze_local: Restrict the run to rows whose existing
            analysis came from the local heuristic (upgrade them to LLM).

    Returns:
        Stats dict describing counts analyzed, skipped, deduped and an
        execution summary of the upgrade run.
    """
    normalized_symbol = str(symbol or "").strip().upper()
    if not normalized_symbol:
        return {"symbol": "", "analyzed": 0}

    def _row_id(row: dict[str, Any]) -> str:
        # Normalized news id used for every membership/source lookup below.
        return str(row.get("id") or "").strip()

    news_rows = store.get_news_items(
        normalized_symbol,
        start_date=start_date,
        end_date=end_date,
        limit=limit,
    )
    total_news_count = len(news_rows)
    skipped_existing_count = 0
    skipped_missing_analysis_count = 0
    skipped_non_local_count = 0
    if news_rows and only_reanalyze_local:
        analyzed_sources = store.get_analyzed_news_sources(
            normalized_symbol,
            start_date=start_date,
            end_date=end_date,
        )
        # Single pass over news_rows instead of the previous four separate
        # comprehensions: classify each row and keep only "local" ones.
        kept_rows: list[dict[str, Any]] = []
        for row in news_rows:
            news_id = _row_id(row)
            if news_id not in analyzed_sources:
                skipped_missing_analysis_count += 1
            elif analyzed_sources[news_id] != "local":
                skipped_non_local_count += 1
            if analyzed_sources.get(news_id) == "local":
                kept_rows.append(row)
            else:
                skipped_existing_count += 1
        news_rows = kept_rows
    elif skip_existing and news_rows:
        analyzed_ids = store.get_analyzed_news_ids(
            normalized_symbol,
            start_date=start_date,
            end_date=end_date,
        )
        kept_rows = []
        for row in news_rows:
            if _row_id(row) in analyzed_ids:
                skipped_existing_count += 1
            else:
                kept_rows.append(row)
        news_rows = kept_rows
    # Window for forward returns; assumes news_rows are ordered newest-first
    # (end at index 0) per get_news_items — TODO confirm against the store.
    ohlc_start = start_date or (news_rows[-1]["trade_date"] if news_rows and news_rows[-1].get("trade_date") else None)
    ohlc_end = end_date or (news_rows[0]["trade_date"] if news_rows and news_rows[0].get("trade_date") else None)
    ohlc_rows = (
        store.get_ohlc(normalized_symbol, ohlc_start, ohlc_end)
        if ohlc_start and ohlc_end
        else []
    )
    analysis_rows, stats = build_analysis_rows(
        symbol=normalized_symbol,
        news_rows=news_rows,
        ohlc_rows=ohlc_rows,
    )
    analyzed = store.upsert_news_analysis(
        normalized_symbol,
        analysis_rows,
        analysis_source=analysis_source,
    )
    upgraded_dates = sorted(
        {
            str(row.get("trade_date") or "")[:10]
            for row in analysis_rows
            if str(row.get("analysis_source") or "").strip().lower() == "llm"
            and str(row.get("trade_date") or "").strip()
        }
    )
    # O(n + m) id->source lookup replaces the previous O(n*m) nested
    # comprehension over news_rows x analysis_rows.
    source_by_news_id = {
        str(row.get("news_id") or "").strip(): str(row.get("analysis_source") or "").strip().lower()
        for row in analysis_rows
    }
    remaining_local_titles = [
        # Fallback is row["id"]: news rows carry "id", not "news_id" (the
        # old row.get("news_id") fallback was always empty).
        str(row.get("title") or row.get("id") or "").strip()
        for row in news_rows
        if source_by_news_id.get(_row_id(row)) == "local"
    ][:5]
    return {
        "symbol": normalized_symbol,
        "analyzed": analyzed,
        "news_count": total_news_count,
        "queued_count": len(news_rows),
        "skipped_existing_count": skipped_existing_count,
        "deduped_count": stats["deduped_count"],
        "llm_count": stats["llm_count"],
        "local_count": stats["local_count"],
        "only_reanalyze_local": only_reanalyze_local,
        "upgraded_local_to_llm_count": (
            stats["llm_count"]
            if only_reanalyze_local
            else 0
        ),
        "execution_summary": {
            "upgraded_dates": upgraded_dates[:5],
            "remaining_local_titles": remaining_local_titles,
            "skipped_missing_analysis_count": skipped_missing_analysis_count,
            "skipped_non_local_count": skipped_non_local_count,
        },
    }
def enrich_symbols(
    store: MarketStore,
    symbols: list[str],
    *,
    start_date: str | None = None,
    end_date: str | None = None,
    limit: int = 200,
    analysis_source: str = "local",
    skip_existing: bool = True,
    only_reanalyze_local: bool = False,
) -> list[dict[str, Any]]:
    """Batch enrich multiple symbols for explain-oriented news analysis.

    Blank/whitespace symbols are skipped; results preserve input order.
    """
    tickers = (str(raw or "").strip().upper() for raw in symbols)
    return [
        enrich_news_for_symbol(
            store,
            ticker,
            start_date=start_date,
            end_date=end_date,
            limit=limit,
            analysis_source=analysis_source,
            skip_existing=skip_existing,
            only_reanalyze_local=only_reanalyze_local,
        )
        for ticker in tickers
        if ticker
    ]