feat(agent): complete EvoAgent integration for all 6 agent roles

Migrate all agent roles from Legacy to EvoAgent architecture:

- fundamentals_analyst, technical_analyst, sentiment_analyst, valuation_analyst
- risk_manager, portfolio_manager

Key changes:

- EvoAgent now supports Portfolio Manager compatibility methods (_make_decision, get_decisions, get_portfolio_state, load_portfolio_state, update_portfolio)
- Add UnifiedAgentFactory for centralized agent creation
- ToolGuard with batch approval API and WebSocket broadcast
- Legacy agents marked deprecated (AnalystAgent, RiskAgent, PMAgent)
- Remove backend/agents/compat.py migration shim
- Add run_id alongside workspace_id for semantic clarity
- Complete integration test coverage (13 tests)
- All smoke tests passing for 6 agent roles

Constraint: Must maintain backward compatibility with existing run configs
Constraint: Memory support must work with EvoAgent (no fallback to Legacy)
Rejected: Separate PM implementation for EvoAgent | unified approach cleaner
Confidence: high
Scope-risk: broad
Directive: EVO_AGENT_IDS env var still respected but defaults to all roles
Not-tested: Kubernetes sandbox mode for skill execution
2026-04-02 00:55:08 +08:00
parent 0fa413380c
commit 16b54d5ccc
73 changed files with 9454 additions and 904 deletions
--- a/backend/api/agents.py
+++ b/backend/api/agents.py
@@ -2,7 +2,10 @@
 """
 Agent API Routes

-Provides REST API endpoints for agent management within workspaces.
+Provides REST API endpoints for both:
+
+- design-time agent management under `workspaces/`
+- run-scoped agent asset access under `runs/<run_id>/`
 """
 import logging
 import os
@@ -24,6 +27,30 @@ from backend.llm.models import get_agent_model_info
 logger = logging.getLogger(__name__)

 router = APIRouter(prefix="/api/workspaces/{workspace_id}/agents", tags=["agents"])
+DESIGN_SCOPE = "design_workspace"
+RUNTIME_SCOPE = "runtime_run"
+RUNTIME_SCOPE_NOTE = (
+    "For profile, skills, and editable agent files, `workspace_id` is treated "
+    "as the active run id under `runs/<run_id>/`, not as the design-time "
+    "`workspaces/` registry."
+)
+
+
+def _runtime_scope_fields() -> dict[str, str]:
+    """Return the scope metadata attached to run-scoped (`runs/<run_id>/`) responses."""
+    # A new dict is built on every call; callers expand it with ** into response models.
+    fields = dict(scope_type=RUNTIME_SCOPE, scope_note=RUNTIME_SCOPE_NOTE)
+    return fields
+
+
+def _design_scope_fields() -> dict[str, str]:
+    """Return the scope metadata attached to design-time (`workspaces/`) responses."""
+    note = (
+        "For design-time CRUD routes on this surface, `workspace_id` refers "
+        "to the persistent registry under `workspaces/`."
+    )
+    # A new dict is built on every call; callers expand it with ** into response models.
+    return {"scope_type": DESIGN_SCOPE, "scope_note": note}


 # Request/Response Models
@@ -68,30 +95,40 @@ class AgentResponse(BaseModel):
    config_path: str
    agent_dir: str
    status: str = "inactive"
+    scope_type: str = DESIGN_SCOPE
+    scope_note: Optional[str] = None


 class AgentFileResponse(BaseModel):
    """Agent file content response."""
    filename: str
    content: str
+    scope_type: str = RUNTIME_SCOPE
+    scope_note: Optional[str] = None


 class AgentProfileResponse(BaseModel):
    agent_id: str
    workspace_id: str
    profile: Dict[str, Any]
+    scope_type: str = RUNTIME_SCOPE
+    scope_note: Optional[str] = None


 class AgentSkillsResponse(BaseModel):
    agent_id: str
    workspace_id: str
    skills: List[Dict[str, Any]]
+    scope_type: str = RUNTIME_SCOPE
+    scope_note: Optional[str] = None


 class SkillDetailResponse(BaseModel):
    agent_id: str
    workspace_id: str
    skill: Dict[str, Any]
+    scope_type: str = RUNTIME_SCOPE
+    scope_note: Optional[str] = None


 # Dependencies
@@ -101,7 +138,7 @@ def get_agent_factory():


 def get_workspace_manager():
-    """Get run-scoped workspace manager instance."""
+    """Get run-scoped asset manager for one runtime workspace/run id."""
    return RunWorkspaceManager()


@@ -119,7 +156,7 @@ async def create_agent(
    registry = Depends(get_registry),
 ):
    """
-    Create a new agent in a workspace.
+    Create a new agent in a design-time workspace registry entry.

    Args:
        workspace_id: Workspace identifier
@@ -162,6 +199,7 @@ async def create_agent(
            config_path=str(agent.config_path),
            agent_dir=str(agent.agent_dir),
            status="inactive",
+            **_design_scope_fields(),
        )

    except ValueError as e:
@@ -174,7 +212,7 @@ async def list_agents(
    factory: AgentFactory = Depends(get_agent_factory),
 ):
    """
-    List all agents in a workspace.
+    List all agents in a design-time workspace registry entry.

    Args:
        workspace_id: Workspace identifier
@@ -192,6 +230,7 @@ async def list_agents(
                config_path=agent["config_path"],
                agent_dir=str(Path(agent["config_path"]).parent),
                status="inactive",
+                **_design_scope_fields(),
            )
            for agent in agents_data
        ]
@@ -206,7 +245,7 @@ async def get_agent(
    registry = Depends(get_registry),
 ):
    """
-    Get agent details.
+    Get design-time agent details from the persistent workspace registry.

    Args:
        workspace_id: Workspace identifier
@@ -227,6 +266,7 @@ async def get_agent(
        config_path=agent_info.config_path,
        agent_dir=agent_info.agent_dir,
        status=agent_info.status,
+        **_design_scope_fields(),
    )


@@ -275,6 +315,7 @@ async def get_agent_profile(
            "enabled_skills": agent_config.enabled_skills,
            "disabled_skills": agent_config.disabled_skills,
        },
+        **_runtime_scope_fields(),
    )


@@ -310,7 +351,12 @@ async def get_agent_skills(
            "status": status,
        })

-    return AgentSkillsResponse(agent_id=agent_id, workspace_id=workspace_id, skills=payload)
+    return AgentSkillsResponse(
+        agent_id=agent_id,
+        workspace_id=workspace_id,
+        skills=payload,
+        **_runtime_scope_fields(),
+    )


@router.get("/{agent_id}/skills/{skill_name}", response_model=SkillDetailResponse)
@@ -329,7 +375,12 @@ async def get_agent_skill_detail(
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail=f"Unknown skill: {skill_name}")

-    return SkillDetailResponse(agent_id=agent_id, workspace_id=workspace_id, skill=detail)
+    return SkillDetailResponse(
+        agent_id=agent_id,
+        workspace_id=workspace_id,
+        skill=detail,
+        **_runtime_scope_fields(),
+    )


@router.delete("/{agent_id}")
@@ -416,6 +467,7 @@ async def update_agent(
        config_path=agent_info.config_path,
        agent_dir=agent_info.agent_dir,
        status=agent_info.status,
+        **_design_scope_fields(),
    )


@@ -656,7 +708,7 @@ async def get_agent_file(
    workspace_manager: RunWorkspaceManager = Depends(get_workspace_manager),
 ):
    """
-    Read an agent's workspace file.
+    Read an agent file from the run-scoped asset tree under `runs/<run_id>/`.

    Args:
        workspace_id: Workspace identifier
@@ -672,7 +724,11 @@ async def get_agent_file(
            agent_id=agent_id,
            filename=filename,
        )
-        return AgentFileResponse(filename=filename, content=content)
+        return AgentFileResponse(
+            filename=filename,
+            content=content,
+            **_runtime_scope_fields(),
+        )
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail=f"File '{filename}' not found")

@@ -686,7 +742,7 @@ async def update_agent_file(
    workspace_manager: RunWorkspaceManager = Depends(get_workspace_manager),
 ):
    """
-    Update an agent's workspace file.
+    Update an agent file in the run-scoped asset tree under `runs/<run_id>/`.

    Args:
        workspace_id: Workspace identifier
@@ -704,6 +760,10 @@ async def update_agent_file(
            filename=filename,
            content=content,
        )
-        return AgentFileResponse(filename=filename, content=content)
+        return AgentFileResponse(
+            filename=filename,
+            content=content,
+            **_runtime_scope_fields(),
+        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
--- a/backend/api/guard.py
+++ b/backend/api/guard.py
@@ -7,7 +7,7 @@ Provides REST API endpoints for tool guard operations.
 from __future__ import annotations

 from typing import Any, Dict, List, Optional
-from datetime import datetime
+from datetime import UTC, datetime

 from fastapi import APIRouter, HTTPException
 from pydantic import BaseModel, Field
@@ -29,7 +29,7 @@ class ToolCallRequest(BaseModel):
    tool_name: str = Field(..., description="Name of the tool")
    tool_input: Dict[str, Any] = Field(default_factory=dict, description="Tool parameters")
    agent_id: str = Field(..., description="Agent making the request")
-    workspace_id: str = Field(..., description="Workspace context")
+    workspace_id: str = Field(..., description="Run context; historical field name retained for compatibility")
    session_id: Optional[str] = Field(None, description="Session identifier")


@@ -46,6 +46,21 @@ class DenyRequest(BaseModel):
    reason: Optional[str] = Field(None, description="Reason for denial")


+class BatchApprovalRequest(BaseModel):
+    """Request body for approving several pending tool calls in one call."""
+    approval_ids: List[str] = Field(..., description="List of approval request IDs")  # required; no length or uniqueness constraint is declared
+    one_time: bool = Field(True, description="Whether these are one-time approvals")  # NOTE(review): never read by batch_approve_tool_calls in this module
+
+
+class BatchApprovalResponse(BaseModel):
+    """Outcome of a batch approval: per-item successes and failures plus summary counts."""
+    approved: List[ApprovalResponse] = Field(default_factory=list, description="Successfully approved")  # NOTE(review): ApprovalResponse is declared later in this module; relies on postponed annotation resolution - confirm
+    failed: List[Dict[str, Any]] = Field(default_factory=list, description="Failed approvals with errors")  # entries carry "approval_id" and "error"
+    total_requested: int  # number of ids in the request, duplicates included
+    total_approved: int  # len(approved)
+    total_failed: int  # len(failed)
+
+
 class ToolFinding(BaseModel):
    """Tool guard finding."""
    severity: SeverityLevel
@@ -61,11 +76,17 @@ class ApprovalResponse(BaseModel):
    tool_input: Dict[str, Any]
    agent_id: str
    workspace_id: str
+    run_id: str
    session_id: Optional[str] = None
    findings: List[ToolFinding] = Field(default_factory=list)
    created_at: str
    resolved_at: Optional[str] = None
    resolved_by: Optional[str] = None
+    scope_type: str = "runtime_run"
+    scope_note: str = (
+        "Approvals are scoped to the active runtime run. `workspace_id` is "
+        "retained as a compatibility field name; prefer `run_id` for display."
+    )


 class PendingApprovalsResponse(BaseModel):
@@ -91,6 +112,7 @@ def _to_response(record: ApprovalRecord) -> ApprovalResponse:
        tool_input=record.tool_input,
        agent_id=record.agent_id,
        workspace_id=record.workspace_id,
+        run_id=record.workspace_id,
        session_id=record.session_id,
        findings=[ToolFinding(**f.to_dict()) for f in record.findings],
        created_at=record.created_at.isoformat(),
@@ -124,7 +146,7 @@ async def check_tool_call(

    if request.tool_name in SAFE_TOOLS:
        record.status = ApprovalStatus.APPROVED
-        record.resolved_at = datetime.utcnow()
+        record.resolved_at = datetime.now(UTC)
        record.resolved_by = "system"
        STORE.set_status(
            record.approval_id,
@@ -156,9 +178,12 @@ async def approve_tool_call(
    if record.status != ApprovalStatus.PENDING:
        raise HTTPException(status_code=400, detail=f"Approval already {record.status}")

-    record.status = ApprovalStatus.APPROVED
-    record.resolved_at = datetime.utcnow()
-    record.resolved_by = "user"
+    record = STORE.set_status(
+        request.approval_id,
+        ApprovalStatus.APPROVED,
+        resolved_by="user",
+        notify_request=True,
+    )

    return _to_response(record)

@@ -183,9 +208,12 @@ async def deny_tool_call(
    if record.status != ApprovalStatus.PENDING:
        raise HTTPException(status_code=400, detail=f"Approval already {record.status}")

-    record.status = ApprovalStatus.DENIED
-    record.resolved_at = datetime.utcnow()
-    record.resolved_by = "user"
+    record = STORE.set_status(
+        request.approval_id,
+        ApprovalStatus.DENIED,
+        resolved_by="user",
+        notify_request=True,
+    )
    record.metadata["denial_reason"] = request.reason

    return _to_response(record)
@@ -200,7 +228,7 @@ async def list_pending_approvals(
    List pending tool approval requests.

    Args:
-        workspace_id: Filter by workspace
+        workspace_id: Filter by run id (historical query parameter name retained)
        agent_id: Filter by agent

    Returns:
@@ -255,3 +283,58 @@ async def cancel_approval(

    STORE.cancel(approval_id)
    return _to_response(record)
+
+
+@router.post("/approve/batch", response_model=BatchApprovalResponse)
+async def batch_approve_tool_calls(
+    request: BatchApprovalRequest,  # NOTE(review): request.one_time is never read here
+):
+    """
+    Approve multiple pending tool calls in a single request.
+
+    Ids are processed independently: an unknown id, a non-pending record, or
+    a failing status update is recorded in `failed` and the loop continues.
+
+    Returns:
+        Approved records, per-id failures, and summary counts.
+    """
+    approved: List[ApprovalResponse] = []
+    failed: List[Dict[str, Any]] = []
+
+    for approval_id in request.approval_ids:
+        record = STORE.get(approval_id)
+        if not record:
+            failed.append({
+                "approval_id": approval_id,
+                "error": "Approval request not found",
+            })
+            continue
+
+        # Only pending records can be approved; a repeated id lands here the second time.
+        if record.status != ApprovalStatus.PENDING:
+            failed.append({
+                "approval_id": approval_id,
+                "error": f"Approval already {record.status}",
+            })
+            continue
+
+        try:
+            record = STORE.set_status(
+                approval_id,
+                ApprovalStatus.APPROVED,
+                resolved_by="user",
+                notify_request=True,
+            )
+            approved.append(_to_response(record))
+        except Exception as e:  # keep going: one failing id must not abort the rest of the batch
+            failed.append({
+                "approval_id": approval_id,
+                "error": str(e),
+            })
+
+    return BatchApprovalResponse(
+        approved=approved,
+        failed=failed,
+        total_requested=len(request.approval_ids),
+        total_approved=len(approved),
+        total_failed=len(failed),
+    )
--- a/backend/api/runtime.py
+++ b/backend/api/runtime.py
@@ -219,6 +219,22 @@ class GatewayStatusResponse(BaseModel):
    is_running: bool
    port: int
    run_id: Optional[str] = None
+    process_status: Optional[str] = None
+    pid: Optional[int] = None
+
+
+class GatewayHealthResponse(BaseModel):  # response body of the `/gateway/health` route
+    status: str  # overall result: "healthy", "degraded", or "unhealthy"
+    checks: Dict[str, Any]  # per-check results keyed "process", "port", "configuration"
+    timestamp: str  # ISO-8601 string produced by datetime.now().isoformat()
+
+
+class RuntimeModeResponse(BaseModel):  # response body of the `/mode` route
+    mode: str  # bootstrap "mode" (default "live"), or "stopped" / "unknown"
+    is_backtest: bool  # True only when mode == "backtest"
+    run_id: Optional[str] = None  # taken from the runtime context's "config_name"
+    schedule_mode: Optional[str] = None  # bootstrap "schedule_mode", if present
+    is_running: bool  # False for both the "stopped" and the "unknown" responses


 class RuntimeConfigResponse(BaseModel):
@@ -264,6 +280,49 @@ def _load_run_snapshot(run_id: str) -> Dict[str, Any]:
    return json.loads(snapshot_path.read_text(encoding="utf-8"))


+def _load_run_server_state(run_dir: Path) -> Dict[str, Any]:
+    """Load persisted server state; {} when the file is missing, unreadable, or not a JSON object."""
+    server_state_path = run_dir / "state" / "server_state.json"
+    try:
+        state = json.loads(server_state_path.read_text(encoding="utf-8"))
+    except Exception:  # best-effort: a missing or corrupt file must not break run listings
+        return {}
+    # Callers treat the result as a mapping (.get), so reject JSON lists/scalars too.
+    return state if isinstance(state, dict) else {}
+
+
+def _extract_history_metrics(run_dir: Path) -> tuple[int, Optional[float]]:
+    """Return (total_trades, total_asset_value), preferring runtime state over dashboard exports."""
+    server_state = _load_run_server_state(run_dir)
+    if not isinstance(server_state, dict):
+        server_state = {}  # malformed state file (JSON list/scalar): fall through to the summary
+    portfolio = server_state.get("portfolio")
+    trades = server_state.get("trades")
+    total_trades = len(trades) if isinstance(trades, list) else 0
+    total_asset_value = None
+    # A malformed "portfolio" (e.g. stored as a list) must not raise and break the listing.
+    if isinstance(portfolio, dict) and portfolio.get("total_value") is not None:
+        try:
+            total_asset_value = float(portfolio.get("total_value"))
+        except (TypeError, ValueError):
+            total_asset_value = None
+
+    if total_trades or total_asset_value is not None:
+        return total_trades, total_asset_value
+
+    # Fallback: the exported dashboard summary, used only when runtime state has no metrics.
+    summary_path = run_dir / "team_dashboard" / "summary.json"
+    if not summary_path.exists():
+        return 0, None
+    try:
+        summary = json.loads(summary_path.read_text(encoding="utf-8"))
+        raw_value = summary.get("totalAssetValue")
+        total_trades = int(summary.get("totalTrades") or 0)
+        return total_trades, (float(raw_value) if raw_value is not None else None)
+    except Exception:
+        return 0, None
+
+
 def _copy_path_if_exists(src: Path, dst: Path) -> None:
    if not src.exists():
        return
@@ -281,7 +340,7 @@ def _restore_run_assets(source_run_id: str, target_run_dir: Path) -> None:
        raise HTTPException(status_code=404, detail=f"Source run not found: {source_run_id}")

    for relative in [
-        "team_dashboard",
+        "team_dashboard/_internal_state.json",
        "agents",
        "skills",
        "memory",
@@ -307,12 +366,10 @@ def _list_runs(limit: int = 50) -> list[RuntimeHistoryItem]:
    for run_dir in run_dirs[: max(1, int(limit))]:
        run_id = run_dir.name
        runtime_state_path = run_dir / "state" / "runtime_state.json"
-        summary_path = run_dir / "team_dashboard" / "summary.json"

        bootstrap: Dict[str, Any] = {}
        updated_at: Optional[str] = None
-        total_trades = 0
-        total_asset_value: Optional[float] = None
+        total_trades, total_asset_value = _extract_history_metrics(run_dir)

        if runtime_state_path.exists():
            try:
@@ -323,15 +380,6 @@ def _list_runs(limit: int = 50) -> list[RuntimeHistoryItem]:
            except Exception:
                bootstrap = {}

-        if summary_path.exists():
-            try:
-                summary = json.loads(summary_path.read_text(encoding="utf-8"))
-                total_trades = int(summary.get("totalTrades") or 0)
-                total_asset_value = float(summary.get("totalAssetValue")) if summary.get("totalAssetValue") is not None else None
-            except Exception:
-                total_trades = 0
-                total_asset_value = None
-
        items.append(
            RuntimeHistoryItem(
                run_id=run_id,
@@ -436,6 +484,14 @@ def _start_gateway_process(
    port: int
 ) -> subprocess.Popen:
    """Start Gateway as a separate process."""
+    # Validate configuration before starting
+    validation_errors = _validate_gateway_config(bootstrap)
+    if validation_errors:
+        raise HTTPException(
+            status_code=400,
+            detail=f"Gateway configuration validation failed: {'; '.join(validation_errors)}"
+        )
+
    # Prepare environment
    env = os.environ.copy()

@@ -467,6 +523,168 @@ def _start_gateway_process(
    return process


+def _validate_gateway_config(bootstrap: Dict[str, Any]) -> List[str]:
+    """Validate bootstrap values and required env vars before launching the Gateway.
+
+    Returns a list of human-readable error messages; an empty list means valid.
+    """
+    errors: List[str] = []
+
+    # Resolve the mode first; several later checks depend on it
+    mode = bootstrap.get("mode", "live")
+    is_backtest = mode == "backtest"
+
+    # Validate mode
+    if mode not in ("live", "backtest"):
+        errors.append(f"Invalid mode '{mode}': must be 'live' or 'backtest'")
+
+    # Check API keys based on mode (any non-backtest mode is treated as live here)
+    if not is_backtest:
+        # Live mode requires FINNHUB_API_KEY
+        finnhub_key = os.getenv("FINNHUB_API_KEY")
+        if not finnhub_key:
+            errors.append("FINNHUB_API_KEY environment variable is required for live mode")
+
+    # Check LLM configuration
+    model_name = os.getenv("MODEL_NAME")
+    openai_key = os.getenv("OPENAI_API_KEY")
+    if not model_name:
+        errors.append("MODEL_NAME environment variable is not set")
+    if not openai_key:
+        errors.append("OPENAI_API_KEY environment variable is not set")
+
+    # Validate tickers
+    tickers = bootstrap.get("tickers", [])
+    if not tickers:
+        errors.append("No tickers specified in configuration")
+    elif not isinstance(tickers, list):
+        errors.append("Tickers must be a list")
+
+    # Validate numeric values; the comparisons are written so that NaN is rejected
+    try:
+        initial_cash = float(bootstrap.get("initial_cash", 0))
+        if not initial_cash > 0:
+            errors.append("initial_cash must be greater than 0")
+    except (TypeError, ValueError):
+        errors.append("initial_cash must be a valid number")
+
+    try:
+        margin_requirement = float(bootstrap.get("margin_requirement", 0))
+        if not 0 <= margin_requirement <= 1:
+            errors.append("margin_requirement must be between 0 and 1")
+    except (TypeError, ValueError):
+        errors.append("margin_requirement must be a valid number")
+
+    # Validate backtest dates
+    if is_backtest:
+        start_date = bootstrap.get("start_date")
+        end_date = bootstrap.get("end_date")
+        if not start_date:
+            errors.append("start_date is required for backtest mode")
+        if not end_date:
+            errors.append("end_date is required for backtest mode")
+        if start_date and end_date:
+            try:
+                from datetime import datetime
+                start = datetime.strptime(start_date, "%Y-%m-%d")
+                end = datetime.strptime(end_date, "%Y-%m-%d")
+                if start >= end:
+                    errors.append("start_date must be before end_date")
+            except (TypeError, ValueError):  # TypeError: strptime rejects non-string values
+                errors.append("Dates must be in YYYY-MM-DD format")
+
+    # Validate schedule mode
+    schedule_mode = bootstrap.get("schedule_mode", "daily")
+    if schedule_mode not in ("daily", "intraday"):
+        errors.append(f"Invalid schedule_mode '{schedule_mode}': must be 'daily' or 'intraday'")
+
+    return errors
+
+
+def _get_gateway_process_details() -> Dict[str, Any]:
+    """Report the tracked Gateway process as a pid/status/returncode dict.
+
+    `status` is "not_running" when no process is tracked, "running" while
+    `poll()` returns None, and "exited" once a return code is available.
+    """
+    gateway = _runtime_state.gateway_process
+    if gateway is None:
+        # Nothing is tracked: either never launched or the handle was cleared.
+        return {
+            "pid": None,
+            "status": "not_running",
+            "returncode": None,
+        }
+
+    pid = gateway.pid
+    # poll() yields None while the child is alive (assuming a subprocess.Popen handle).
+    exit_code = gateway.poll()
+    return {
+        "pid": pid,
+        "status": "running" if exit_code is None else "exited",
+        "returncode": exit_code,
+    }
+
+
+def _check_gateway_health() -> Dict[str, Any]:
+    """Run process, port, and configuration checks and fold them into one overall status."""
+    checks = {
+        "process": {"status": "unknown", "details": {}},
+        "port": {"status": "unknown", "details": {}},
+        "configuration": {"status": "unknown", "details": {}},
+    }
+
+    # Process check: map the tracked process state onto a health status
+    process_details = _get_gateway_process_details()
+    checks["process"]["details"] = process_details  # same dict object, so the "error" key below lands in it
+
+    if process_details["status"] == "running":
+        checks["process"]["status"] = "healthy"
+    elif process_details["status"] == "exited":
+        checks["process"]["status"] = "unhealthy"
+        checks["process"]["details"]["error"] = f"Process exited with code {process_details['returncode']}"
+    else:
+        checks["process"]["status"] = "unknown"  # "not_running": no process is tracked
+
+    # Port check: try a TCP connection to the Gateway port on localhost
+    import socket
+    port = _runtime_state.gateway_port
+    try:
+        with socket.create_connection(("127.0.0.1", port), timeout=2):  # NOTE(review): blocking call, up to 2s; callers here are async routes
+            checks["port"]["status"] = "healthy"
+            checks["port"]["details"] = {"port": port, "accessible": True}
+    except OSError as e:
+        checks["port"]["status"] = "unhealthy"
+        checks["port"]["details"] = {"port": port, "accessible": False, "error": str(e)}
+
+    # Configuration check: only verifies that a runtime manager is registered
+    try:
+        if _runtime_state.runtime_manager is not None:
+            checks["configuration"]["status"] = "healthy"
+            checks["configuration"]["details"]["has_runtime_manager"] = True
+        else:
+            checks["configuration"]["status"] = "degraded"
+            checks["configuration"]["details"]["has_runtime_manager"] = False
+    except Exception as e:
+        checks["configuration"]["status"] = "unknown"
+        checks["configuration"]["details"]["error"] = str(e)
+
+    # Overall status: any "unhealthy" wins, all "healthy" is healthy, anything else is degraded
+    statuses = [c["status"] for c in checks.values()]
+    if any(s == "unhealthy" for s in statuses):
+        overall_status = "unhealthy"
+    elif all(s == "healthy" for s in statuses):
+        overall_status = "healthy"
+    else:
+        overall_status = "degraded"
+
+    return {
+        "status": overall_status,
+        "checks": checks,
+        "timestamp": datetime.now().isoformat(),  # NOTE(review): naive local time, no UTC offset in the string
+    }
+
+
@router.get("/context", response_model=RunContextResponse)
 async def get_run_context() -> RunContextResponse:
    """Return active runtime context, or latest persisted context when stopped."""
@@ -512,9 +730,10 @@ async def get_runtime_history(limit: int = 20) -> RuntimeHistoryResponse:

@router.get("/gateway/status", response_model=GatewayStatusResponse)
 async def get_gateway_status() -> GatewayStatusResponse:
-    """Get Gateway process status and port."""
+    """Get Gateway process status and port with detailed process information."""
    is_running = _is_gateway_running()
    run_id = None
+    process_details = _get_gateway_process_details()

    if is_running:
        try:
@@ -525,10 +744,55 @@ async def get_gateway_status() -> GatewayStatusResponse:
    return GatewayStatusResponse(
        is_running=is_running,
        port=_runtime_state.gateway_port,
-        run_id=run_id
+        run_id=run_id,
+        process_status=process_details["status"],
+        pid=process_details["pid"],
    )


+@router.get("/gateway/health", response_model=GatewayHealthResponse)
+async def get_gateway_health() -> GatewayHealthResponse:
+    """Return the aggregated Gateway health report (process, port, and configuration checks)."""
+    # The helper returns exactly the status/checks/timestamp keys the response model declares.
+    return GatewayHealthResponse(**_check_gateway_health())
+
+
+@router.get("/mode", response_model=RuntimeModeResponse)
+async def get_runtime_mode() -> RuntimeModeResponse:
+    """Report the runtime mode: bootstrap mode when running, "stopped" when not, "unknown" without context."""
+    is_running = _is_gateway_running()
+
+    if not is_running:
+        return RuntimeModeResponse(
+            mode="stopped",
+            is_backtest=False,
+            run_id=None,
+            schedule_mode=None,
+            is_running=False,
+        )
+
+    try:
+        context = _get_active_runtime_context()
+        bootstrap = context.get("bootstrap_values") or {}  # a stored null must not raise AttributeError below
+        mode = bootstrap.get("mode", "live")
+        # The run id is exposed from the context's "config_name" entry.
+        return RuntimeModeResponse(
+            mode=mode,
+            is_backtest=mode == "backtest",
+            run_id=context.get("config_name"),
+            schedule_mode=bootstrap.get("schedule_mode"),
+            is_running=True,
+        )
+    except HTTPException:  # NOTE(review): the Gateway is running here, yet is_running is reported False - confirm intent
+        return RuntimeModeResponse(
+            mode="unknown",
+            is_backtest=False,
+            run_id=None,
+            schedule_mode=None,
+            is_running=False,
+        )
+
+
@router.get("/gateway/port")
 async def get_gateway_port(request: Request) -> Dict[str, Any]:
    """Get WebSocket Gateway port for frontend connection."""
@@ -807,14 +1071,38 @@ async def start_runtime(
            _runtime_state.gateway_process = None
            log_path = _get_gateway_log_path_for_run(run_id)
            log_tail = _read_log_tail(log_path, max_chars=4000)
+
+            # Build detailed error message
+            error_details = []
+            error_details.append(f"Gateway process exited unexpectedly")
+
+            process_details = _get_gateway_process_details()
+            if process_details.get("returncode") is not None:
+                error_details.append(f"Exit code: {process_details['returncode']}")
+
+            if log_tail:
+                error_details.append(f"Recent log output:\n{log_tail}")
+            else:
+                error_details.append("No log output available. Check environment configuration.")
+
+            # Check common configuration issues
+            config_errors = _validate_gateway_config(bootstrap)
+            if config_errors:
+                error_details.append(f"Configuration issues detected: {'; '.join(config_errors)}")
+
            raise HTTPException(
                status_code=500,
-                detail=f"Gateway failed to start: {log_tail or 'Unknown error'}"
+                detail="\n".join(error_details)
            )

+    except HTTPException:
+        raise
    except Exception as e:
        _stop_gateway()
-        raise HTTPException(status_code=500, detail=f"Failed to start Gateway: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to start Gateway: {type(e).__name__}: {str(e)}"
+        )

    return LaunchResponse(
        run_id=run_id,
@@ -861,17 +1149,38 @@ async def stop_runtime(force: bool = True) -> StopResponse:
    was_running = _is_gateway_running()

    if not was_running:
+        process_details = _get_gateway_process_details()
+        if process_details["status"] == "exited":
+            # Process exited but we have a record of it
+            raise HTTPException(
+                status_code=404,
+                detail=(
+                    f"No runtime is currently running. "
+                    f"Previous Gateway process exited with code {process_details['returncode']}. "
+                    f"PID: {process_details['pid']}"
+                )
+            )
        raise HTTPException(status_code=404, detail="No runtime is currently running")

+    # Get process details before stopping for the response
+    process_details = _get_gateway_process_details()
+    pid_info = f" (PID: {process_details.get('pid')})" if process_details.get('pid') else ""
+
    # Stop Gateway process
-    _stop_gateway()
+    stop_success = _stop_gateway()
+
+    if not stop_success:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to stop Gateway process{pid_info}. Process may have already terminated."
+        )

    # Unregister runtime manager
    unregister_runtime_manager()

    return StopResponse(
        status="stopped",
-        message="Runtime stopped successfully",
+        message=f"Runtime stopped successfully{pid_info}",
    )


--- a/backend/api/workspaces.py
+++ b/backend/api/workspaces.py
@@ -1,8 +1,9 @@
 # -*- coding: utf-8 -*-
 """
-Workspace API Routes
+Workspace API Routes.

-Provides REST API endpoints for workspace management.
+These routes manage the design-time `workspaces/` registry, not the run-scoped
+runtime data under `runs/<run_id>/`.
 """
 from typing import Any, Dict, List, Optional

@@ -31,7 +32,7 @@ class UpdateWorkspaceRequest(BaseModel):


 class WorkspaceResponse(BaseModel):
-    """Workspace information response."""
+    """Design-time workspace information response."""
    workspace_id: str
    name: str
    description: str
@@ -89,10 +90,10 @@ async def list_workspaces(
    manager: WorkspaceManager = Depends(get_workspace_manager),
 ):
    """
-    List all workspaces.
+    List all design-time workspaces.

    Returns:
-        List of workspaces
+        List of design-time workspaces
    """
    workspaces = manager.list_workspaces()
    return WorkspaceListResponse(