feat(agent): complete EvoAgent integration for all 6 agent roles

Migrate all agent roles from Legacy to EvoAgent architecture:
- fundamentals_analyst, technical_analyst, sentiment_analyst, valuation_analyst
- risk_manager, portfolio_manager

Key changes:
- EvoAgent now supports Portfolio Manager compatibility methods (_make_decision, get_decisions, get_portfolio_state, load_portfolio_state, update_portfolio)
- Add UnifiedAgentFactory for centralized agent creation
- ToolGuard with batch approval API and WebSocket broadcast
- Legacy agents marked deprecated (AnalystAgent, RiskAgent, PMAgent)
- Remove backend/agents/compat.py migration shim
- Add run_id alongside workspace_id for semantic clarity
- Complete integration test coverage (13 tests)
- All smoke tests passing for 6 agent roles

Constraint: Must maintain backward compatibility with existing run configs
Constraint: Memory support must work with EvoAgent (no fallback to Legacy)
Rejected: Separate PM implementation for EvoAgent | unified approach cleaner
Confidence: high
Scope-risk: broad
Directive: EVO_AGENT_IDS env var still respected but defaults to all roles
Not-tested: Kubernetes sandbox mode for skill execution
2026-04-02 00:55:08 +08:00
parent 0fa413380c
commit 16b54d5ccc
73 changed files with 9454 additions and 904 deletions
--- a/backend/api/runtime.py
+++ b/backend/api/runtime.py
@@ -219,6 +219,22 @@ class GatewayStatusResponse(BaseModel):
    is_running: bool
    port: int
    run_id: Optional[str] = None
+    process_status: Optional[str] = None
+    pid: Optional[int] = None
+
+
+class GatewayHealthResponse(BaseModel):
+    status: str
+    checks: Dict[str, Any]
+    timestamp: str
+
+
+class RuntimeModeResponse(BaseModel):
+    mode: str
+    is_backtest: bool
+    run_id: Optional[str] = None
+    schedule_mode: Optional[str] = None
+    is_running: bool


 class RuntimeConfigResponse(BaseModel):
@@ -264,6 +280,49 @@ def _load_run_snapshot(run_id: str) -> Dict[str, Any]:
    return json.loads(snapshot_path.read_text(encoding="utf-8"))


+def _load_run_server_state(run_dir: Path) -> Dict[str, Any]:
+    """Load persisted runtime server state if present."""
+    server_state_path = run_dir / "state" / "server_state.json"
+    if not server_state_path.exists():
+        return {}
+    try:
+        return json.loads(server_state_path.read_text(encoding="utf-8"))
+    except Exception:
+        return {}
+
+
+def _extract_history_metrics(run_dir: Path) -> tuple[int, Optional[float]]:
+    """Prefer runtime state files over dashboard exports for history summaries."""
+    server_state = _load_run_server_state(run_dir)
+    portfolio = server_state.get("portfolio") or {}
+    trades = server_state.get("trades")
+    total_trades = len(trades) if isinstance(trades, list) else 0
+    total_asset_value = None
+    if portfolio.get("total_value") is not None:
+        try:
+            total_asset_value = float(portfolio.get("total_value"))
+        except (TypeError, ValueError):
+            total_asset_value = None
+
+    if total_trades or total_asset_value is not None:
+        return total_trades, total_asset_value
+
+    summary_path = run_dir / "team_dashboard" / "summary.json"
+    if not summary_path.exists():
+        return 0, None
+    try:
+        summary = json.loads(summary_path.read_text(encoding="utf-8"))
+        total_trades = int(summary.get("totalTrades") or 0)
+        total_asset_value = (
+            float(summary.get("totalAssetValue"))
+            if summary.get("totalAssetValue") is not None
+            else None
+        )
+        return total_trades, total_asset_value
+    except Exception:
+        return 0, None
+
+
 def _copy_path_if_exists(src: Path, dst: Path) -> None:
    if not src.exists():
        return
@@ -281,7 +340,7 @@ def _restore_run_assets(source_run_id: str, target_run_dir: Path) -> None:
        raise HTTPException(status_code=404, detail=f"Source run not found: {source_run_id}")

    for relative in [
-        "team_dashboard",
+        "team_dashboard/_internal_state.json",
        "agents",
        "skills",
        "memory",
@@ -307,12 +366,10 @@ def _list_runs(limit: int = 50) -> list[RuntimeHistoryItem]:
    for run_dir in run_dirs[: max(1, int(limit))]:
        run_id = run_dir.name
        runtime_state_path = run_dir / "state" / "runtime_state.json"
-        summary_path = run_dir / "team_dashboard" / "summary.json"

        bootstrap: Dict[str, Any] = {}
        updated_at: Optional[str] = None
-        total_trades = 0
-        total_asset_value: Optional[float] = None
+        total_trades, total_asset_value = _extract_history_metrics(run_dir)

        if runtime_state_path.exists():
            try:
@@ -323,15 +380,6 @@ def _list_runs(limit: int = 50) -> list[RuntimeHistoryItem]:
            except Exception:
                bootstrap = {}

-        if summary_path.exists():
-            try:
-                summary = json.loads(summary_path.read_text(encoding="utf-8"))
-                total_trades = int(summary.get("totalTrades") or 0)
-                total_asset_value = float(summary.get("totalAssetValue")) if summary.get("totalAssetValue") is not None else None
-            except Exception:
-                total_trades = 0
-                total_asset_value = None
-
        items.append(
            RuntimeHistoryItem(
                run_id=run_id,
@@ -436,6 +484,14 @@ def _start_gateway_process(
    port: int
 ) -> subprocess.Popen:
    """Start Gateway as a separate process."""
+    # Validate configuration before starting
+    validation_errors = _validate_gateway_config(bootstrap)
+    if validation_errors:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Gateway configuration validation failed: {'; '.join(validation_errors)}"
+        )
+
    # Prepare environment
    env = os.environ.copy()

@@ -467,6 +523,168 @@ def _start_gateway_process(
    return process


+def _validate_gateway_config(bootstrap: Dict[str, Any]) -> List[str]:
+    """Validate Gateway bootstrap configuration.
+
+    Returns a list of validation error messages. Empty list means valid.
+    """
+    errors: List[str] = []
+
+    # Check required environment variables based on mode
+    mode = bootstrap.get("mode", "live")
+    is_backtest = mode == "backtest"
+
+    # Validate mode
+    if mode not in ("live", "backtest"):
+        errors.append(f"Invalid mode '{mode}': must be 'live' or 'backtest'")
+
+    # Check API keys based on mode
+    if not is_backtest:
+        # Live mode requires FINNHUB_API_KEY
+        finnhub_key = os.getenv("FINNHUB_API_KEY")
+        if not finnhub_key:
+            errors.append("FINNHUB_API_KEY environment variable is required for live mode")
+
+    # Check LLM configuration
+    model_name = os.getenv("MODEL_NAME")
+    openai_key = os.getenv("OPENAI_API_KEY")
+    if not model_name:
+        errors.append("MODEL_NAME environment variable is not set")
+    if not openai_key:
+        errors.append("OPENAI_API_KEY environment variable is not set")
+
+    # Validate tickers
+    tickers = bootstrap.get("tickers", [])
+    if not tickers:
+        errors.append("No tickers specified in configuration")
+    elif not isinstance(tickers, list):
+        errors.append("Tickers must be a list")
+
+    # Validate numeric values
+    try:
+        initial_cash = float(bootstrap.get("initial_cash", 0))
+        if initial_cash <= 0:
+            errors.append("initial_cash must be greater than 0")
+    except (TypeError, ValueError):
+        errors.append("initial_cash must be a valid number")
+
+    try:
+        margin_requirement = float(bootstrap.get("margin_requirement", 0))
+        if margin_requirement < 0 or margin_requirement > 1:
+            errors.append("margin_requirement must be between 0 and 1")
+    except (TypeError, ValueError):
+        errors.append("margin_requirement must be a valid number")
+
+    # Validate backtest dates
+    if is_backtest:
+        start_date = bootstrap.get("start_date")
+        end_date = bootstrap.get("end_date")
+        if not start_date:
+            errors.append("start_date is required for backtest mode")
+        if not end_date:
+            errors.append("end_date is required for backtest mode")
+        if start_date and end_date:
+            try:
+                from datetime import datetime
+                start = datetime.strptime(start_date, "%Y-%m-%d")
+                end = datetime.strptime(end_date, "%Y-%m-%d")
+                if start >= end:
+                    errors.append("start_date must be before end_date")
+            except ValueError:
+                errors.append("Dates must be in YYYY-MM-DD format")
+
+    # Validate schedule mode
+    schedule_mode = bootstrap.get("schedule_mode", "daily")
+    if schedule_mode not in ("daily", "intraday"):
+        errors.append(f"Invalid schedule_mode '{schedule_mode}': must be 'daily' or 'intraday'")
+
+    return errors
+
+
+def _get_gateway_process_details() -> Dict[str, Any]:
+    """Get detailed information about the Gateway process."""
+    process = _runtime_state.gateway_process
+    details = {
+        "pid": None,
+        "status": "not_running",
+        "returncode": None,
+    }
+
+    if process is None:
+        return details
+
+    details["pid"] = process.pid
+    returncode = process.poll()
+
+    if returncode is None:
+        details["status"] = "running"
+        details["returncode"] = None
+    else:
+        details["status"] = "exited"
+        details["returncode"] = returncode
+
+    return details
+
+
+def _check_gateway_health() -> Dict[str, Any]:
+    """Perform comprehensive health checks on Gateway."""
+    checks = {
+        "process": {"status": "unknown", "details": {}},
+        "port": {"status": "unknown", "details": {}},
+        "configuration": {"status": "unknown", "details": {}},
+    }
+
+    # Check process status
+    process_details = _get_gateway_process_details()
+    checks["process"]["details"] = process_details
+
+    if process_details["status"] == "running":
+        checks["process"]["status"] = "healthy"
+    elif process_details["status"] == "exited":
+        checks["process"]["status"] = "unhealthy"
+        checks["process"]["details"]["error"] = f"Process exited with code {process_details['returncode']}"
+    else:
+        checks["process"]["status"] = "unknown"
+
+    # Check port connectivity
+    import socket
+    port = _runtime_state.gateway_port
+    try:
+        with socket.create_connection(("127.0.0.1", port), timeout=2):
+            checks["port"]["status"] = "healthy"
+            checks["port"]["details"] = {"port": port, "accessible": True}
+    except OSError as e:
+        checks["port"]["status"] = "unhealthy"
+        checks["port"]["details"] = {"port": port, "accessible": False, "error": str(e)}
+
+    # Check configuration
+    try:
+        if _runtime_state.runtime_manager is not None:
+            checks["configuration"]["status"] = "healthy"
+            checks["configuration"]["details"]["has_runtime_manager"] = True
+        else:
+            checks["configuration"]["status"] = "degraded"
+            checks["configuration"]["details"]["has_runtime_manager"] = False
+    except Exception as e:
+        checks["configuration"]["status"] = "unknown"
+        checks["configuration"]["details"]["error"] = str(e)
+
+    # Determine overall status
+    statuses = [c["status"] for c in checks.values()]
+    if any(s == "unhealthy" for s in statuses):
+        overall_status = "unhealthy"
+    elif all(s == "healthy" for s in statuses):
+        overall_status = "healthy"
+    else:
+        overall_status = "degraded"
+
+    return {
+        "status": overall_status,
+        "checks": checks,
+        "timestamp": datetime.now().isoformat(),
+    }
+
+
@router.get("/context", response_model=RunContextResponse)
 async def get_run_context() -> RunContextResponse:
    """Return active runtime context, or latest persisted context when stopped."""
@@ -512,9 +730,10 @@ async def get_runtime_history(limit: int = 20) -> RuntimeHistoryResponse:

@router.get("/gateway/status", response_model=GatewayStatusResponse)
 async def get_gateway_status() -> GatewayStatusResponse:
-    """Get Gateway process status and port."""
+    """Get Gateway process status and port with detailed process information."""
    is_running = _is_gateway_running()
    run_id = None
+    process_details = _get_gateway_process_details()

    if is_running:
        try:
@@ -525,10 +744,55 @@ async def get_gateway_status() -> GatewayStatusResponse:
    return GatewayStatusResponse(
        is_running=is_running,
        port=_runtime_state.gateway_port,
-        run_id=run_id
+        run_id=run_id,
+        process_status=process_details["status"],
+        pid=process_details["pid"],
    )


+@router.get("/gateway/health", response_model=GatewayHealthResponse)
+async def get_gateway_health() -> GatewayHealthResponse:
+    """Get comprehensive Gateway health check including process, port, and configuration status."""
+    health = _check_gateway_health()
+    return GatewayHealthResponse(**health)
+
+
+@router.get("/mode", response_model=RuntimeModeResponse)
+async def get_runtime_mode() -> RuntimeModeResponse:
+    """Get current runtime mode (live or backtest) and related configuration."""
+    is_running = _is_gateway_running()
+
+    if not is_running:
+        return RuntimeModeResponse(
+            mode="stopped",
+            is_backtest=False,
+            run_id=None,
+            schedule_mode=None,
+            is_running=False,
+        )
+
+    try:
+        context = _get_active_runtime_context()
+        bootstrap = context.get("bootstrap_values", {})
+        mode = bootstrap.get("mode", "live")
+
+        return RuntimeModeResponse(
+            mode=mode,
+            is_backtest=mode == "backtest",
+            run_id=context.get("config_name"),
+            schedule_mode=bootstrap.get("schedule_mode"),
+            is_running=True,
+        )
+    except HTTPException:
+        return RuntimeModeResponse(
+            mode="unknown",
+            is_backtest=False,
+            run_id=None,
+            schedule_mode=None,
+            is_running=False,
+        )
+
+
@router.get("/gateway/port")
 async def get_gateway_port(request: Request) -> Dict[str, Any]:
    """Get WebSocket Gateway port for frontend connection."""
@@ -807,14 +1071,38 @@ async def start_runtime(
            _runtime_state.gateway_process = None
            log_path = _get_gateway_log_path_for_run(run_id)
            log_tail = _read_log_tail(log_path, max_chars=4000)
+
+            # Build detailed error message
+            error_details = []
+            error_details.append(f"Gateway process exited unexpectedly")
+
+            process_details = _get_gateway_process_details()
+            if process_details.get("returncode") is not None:
+                error_details.append(f"Exit code: {process_details['returncode']}")
+
+            if log_tail:
+                error_details.append(f"Recent log output:\n{log_tail}")
+            else:
+                error_details.append("No log output available. Check environment configuration.")
+
+            # Check common configuration issues
+            config_errors = _validate_gateway_config(bootstrap)
+            if config_errors:
+                error_details.append(f"Configuration issues detected: {'; '.join(config_errors)}")
+
            raise HTTPException(
                status_code=500,
-                detail=f"Gateway failed to start: {log_tail or 'Unknown error'}"
+                detail="\n".join(error_details)
            )

+    except HTTPException:
+        raise
    except Exception as e:
        _stop_gateway()
-        raise HTTPException(status_code=500, detail=f"Failed to start Gateway: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to start Gateway: {type(e).__name__}: {str(e)}"
+        )

    return LaunchResponse(
        run_id=run_id,
@@ -861,17 +1149,38 @@ async def stop_runtime(force: bool = True) -> StopResponse:
    was_running = _is_gateway_running()

    if not was_running:
+        process_details = _get_gateway_process_details()
+        if process_details["status"] == "exited":
+            # Process exited but we have a record of it
+            raise HTTPException(
+                status_code=404,
+                detail=(
+                    f"No runtime is currently running. "
+                    f"Previous Gateway process exited with code {process_details['returncode']}. "
+                    f"PID: {process_details['pid']}"
+                )
+            )
        raise HTTPException(status_code=404, detail="No runtime is currently running")

+    # Get process details before stopping for the response
+    process_details = _get_gateway_process_details()
+    pid_info = f" (PID: {process_details.get('pid')})" if process_details.get('pid') else ""
+
    # Stop Gateway process
-    _stop_gateway()
+    stop_success = _stop_gateway()
+
+    if not stop_success:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to stop Gateway process{pid_info}. Process may have already terminated."
+        )

    # Unregister runtime manager
    unregister_runtime_manager()

    return StopResponse(
        status="stopped",
-        message="Runtime stopped successfully",
+        message=f"Runtime stopped successfully{pid_info}",
    )