feat: Add evaluation hooks, skill adaptation and team pipeline config

- Add EvaluationHook for post-execution agent evaluation
- Add SkillAdaptationHook for dynamic skill adaptation
- Add team/ directory with team coordination logic
- Add TEAM_PIPELINE.yaml for smoke_fullstack pipeline config
- Update RuntimeView, TraderView and RuntimeSettingsPanel UI
- Add runtimeApi and websocket services
- Add runtime_state.json to smoke_fullstack state

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-19 18:52:12 +08:00
parent f4a2b7f3af
commit 4b5ac86b83
87 changed files with 5042 additions and 744 deletions
--- a/backend/api/agents.py
+++ b/backend/api/agents.py
@@ -4,15 +4,20 @@ Agent API Routes

 Provides REST API endpoints for agent management within workspaces.
 """
-from typing import Any, Dict, List, Optional
+import logging
+import os
+import tempfile
 from pathlib import Path
+from typing import Any, Dict, List, Optional

-from fastapi import APIRouter, HTTPException, Depends, Body
+from fastapi import APIRouter, HTTPException, Depends, Body, UploadFile, File, Form
 from pydantic import BaseModel, Field

 from backend.agents import AgentFactory, WorkspaceManager, get_registry
 from backend.agents.skills_manager import SkillsManager

+logger = logging.getLogger(__name__)
+
 router = APIRouter(prefix="/api/workspaces/{workspace_id}/agents", tags=["agents"])


@@ -35,6 +40,13 @@ class UpdateAgentRequest(BaseModel):
    disabled_skills: Optional[List[str]] = None


+class InstallExternalSkillRequest(BaseModel):
+    """Request body for installing an external skill for a single agent.
+
+    The install endpoint forwards each field to
+    ``SkillsManager.install_external_skill_for_agent``.
+    """
+    # Required: where the skill package comes from.
+    source: str = Field(..., description="Directory path, zip path, or http(s) zip URL")
+    # Optional name override, forwarded as ``skill_name``; defaults to None.
+    name: Optional[str] = Field(None, description="Optional override skill name")
+    # Forwarded as ``activate``; defaults to True.
+    activate: bool = Field(True, description="Whether to enable skill immediately")
+
+
 class AgentResponse(BaseModel):
    """Agent information response."""
    agent_id: str
@@ -344,6 +356,86 @@ async def disable_skill(
    }


+@router.post("/{agent_id}/skills/install")
+async def install_external_skill(
+    workspace_id: str,
+    agent_id: str,
+    request: InstallExternalSkillRequest,
+    registry=Depends(get_registry),
+):
+    """Install an external skill into one agent's local skills.
+
+    Args:
+        workspace_id: Workspace the agent must belong to; also passed to the
+            skills manager as ``config_name``.
+        agent_id: Agent that receives the skill.
+        request: Skill source, optional name override and activation flag.
+        registry: Agent registry used to verify that the agent exists.
+
+    Returns:
+        A dict with a ``message`` plus every key returned by the skills manager.
+
+    Raises:
+        HTTPException: 404 if the agent is unknown or belongs to another
+            workspace; 400 if the skills manager raises ``FileNotFoundError``
+            or ``ValueError``.
+    """
+    agent_info = registry.get(agent_id)
+    if not agent_info or agent_info.workspace_id != workspace_id:
+        raise HTTPException(status_code=404, detail=f"Agent '{agent_id}' not found")
+
+    # SECURITY NOTE(review): ``request.source`` is caller-supplied and, per the
+    # request model, may be a local path or an http(s) URL. Nothing in this
+    # handler restricts it; confirm that SkillsManager validates the source
+    # before this endpoint is exposed to untrusted clients.
+    skills_manager = SkillsManager()
+    try:
+        result = skills_manager.install_external_skill_for_agent(
+            config_name=workspace_id,
+            agent_id=agent_id,
+            source=request.source,
+            skill_name=request.name,
+            activate=request.activate,
+        )
+    except (FileNotFoundError, ValueError) as exc:
+        # Chain the cause so the original traceback is kept for debugging.
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+    return {
+        "message": f"Installed external skill '{result['skill_name']}' for '{agent_id}'",
+        **result,
+    }
+
+
+@router.post("/{agent_id}/skills/upload")
+async def upload_external_skill(
+    workspace_id: str,
+    agent_id: str,
+    file: UploadFile = File(...),
+    name: Optional[str] = Form(None),
+    activate: bool = Form(True),
+    registry=Depends(get_registry),
+):
+    """Upload a zip skill package from frontend and install for one agent.
+
+    The upload is spooled to a temporary file, handed to the skills manager
+    as its ``source``, and the temporary file is removed afterwards whether
+    or not the installation succeeded.
+
+    Args:
+        workspace_id: Workspace the agent must belong to; also passed to the
+            skills manager as ``config_name``.
+        agent_id: Agent that receives the skill.
+        file: Uploaded archive; its filename must end in ``.zip``
+            (case-insensitive).
+        name: Optional skill name override.
+        activate: Whether to enable the skill immediately.
+        registry: Agent registry used to verify that the agent exists.
+
+    Returns:
+        A dict with a ``message`` plus every key returned by the skills manager.
+
+    Raises:
+        HTTPException: 404 if the agent is unknown or belongs to another
+            workspace; 400 if the file is not a ``.zip`` or the skills manager
+            raises ``FileNotFoundError`` or ``ValueError``.
+    """
+    agent_info = registry.get(agent_id)
+    if not agent_info or agent_info.workspace_id != workspace_id:
+        raise HTTPException(status_code=404, detail=f"Agent '{agent_id}' not found")
+
+    original_name = (file.filename or "").strip()
+    if not original_name.lower().endswith(".zip"):
+        raise HTTPException(status_code=400, detail="Uploaded file must be a .zip archive")
+
+    suffix = Path(original_name).suffix or ".zip"
+    temp_path: Optional[str] = None
+    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+            temp_path = tmp.name
+            # Copy in fixed-size chunks so a large upload is never held in
+            # memory as a single bytes object.
+            while True:
+                chunk = await file.read(1024 * 1024)
+                if not chunk:
+                    break
+                tmp.write(chunk)
+
+        skills_manager = SkillsManager()
+        result = skills_manager.install_external_skill_for_agent(
+            config_name=workspace_id,
+            agent_id=agent_id,
+            source=temp_path,
+            skill_name=name,
+            activate=activate,
+        )
+    except (FileNotFoundError, ValueError) as exc:
+        # Chain the cause so the original traceback is kept for debugging.
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+    finally:
+        try:
+            await file.close()
+        except Exception as e:
+            logger.warning(f"Failed to close uploaded file: {e}")
+        if temp_path is not None:
+            # Cleanup is best-effort: a failure here must not replace the
+            # response or the exception produced by the installation itself.
+            try:
+                os.remove(temp_path)
+            except FileNotFoundError:
+                pass
+            except OSError as e:
+                logger.warning(f"Failed to remove temporary file {temp_path}: {e}")
+
+    return {
+        "message": f"Uploaded and installed external skill '{result['skill_name']}' for '{agent_id}'",
+        **result,
+    }
+
+
@router.get("/{agent_id}/files/{filename}", response_model=AgentFileResponse)
 async def get_agent_file(
    workspace_id: str,
--- a/backend/api/runtime.py
+++ b/backend/api/runtime.py
@@ -1,19 +1,25 @@
 # -*- coding: utf-8 -*-
-"""Runtime API routes exposing the latest trading run state."""
+"""Runtime API routes - Control Plane for managing Gateway processes."""

 from __future__ import annotations

 import asyncio
 import json
+import logging
+import os
+import signal
+import subprocess
+import sys
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, List, Optional

+logger = logging.getLogger(__name__)
+
 from fastapi import APIRouter, HTTPException, BackgroundTasks
 from pydantic import BaseModel, Field

 from backend.runtime.agent_runtime import AgentRuntimeState
-from backend.runtime.context import TradingRunContext
 from backend.runtime.manager import TradingRuntimeManager, get_global_runtime_manager

 router = APIRouter(prefix="/api/runtime", tags=["runtime"])
@@ -21,9 +27,9 @@ router = APIRouter(prefix="/api/runtime", tags=["runtime"])
 runtime_manager: Optional[TradingRuntimeManager] = None
 PROJECT_ROOT = Path(__file__).resolve().parents[2]

-# Global task reference for running pipeline
-_running_task: Optional[asyncio.Task] = None
-_stop_event: Optional[asyncio.Event] = None
+# Gateway process management
+_gateway_process: Optional[subprocess.Popen] = None
+_gateway_port: int = 8765


 class RunContextResponse(BaseModel):
@@ -67,12 +73,15 @@ class LaunchConfig(BaseModel):
    mode: str = Field(default="live", description="运行模式: live, backtest")
    start_date: Optional[str] = Field(default=None, description="回测开始日期 YYYY-MM-DD")
    end_date: Optional[str] = Field(default=None, description="回测结束日期 YYYY-MM-DD")
+    poll_interval: int = Field(default=10, ge=1, le=300, description="市场数据轮询间隔(秒)")
+    enable_mock: bool = Field(default=False, description="是否启用模拟模式(使用模拟价格数据)")


 class LaunchResponse(BaseModel):
    run_id: str
    status: str
    run_dir: str
+    gateway_port: int
    message: str


@@ -81,10 +90,10 @@ class StopResponse(BaseModel):
    message: str


-class RestartResponse(BaseModel):
-    run_id: str
-    status: str
-    message: str
+class GatewayStatusResponse(BaseModel):
+    """Response model of the ``/gateway/status`` endpoint."""
+    # Whether the Gateway subprocess is currently alive.
+    is_running: bool
+    # Port number recorded for the Gateway.
+    port: int
+    # Filled from the latest snapshot's ``config_name`` when the Gateway is
+    # running; otherwise None.
+    run_id: Optional[str] = None


 def _generate_run_id() -> str:
@@ -97,44 +106,92 @@ def _get_run_dir(run_id: str) -> Path:
    return PROJECT_ROOT / "runs" / run_id


-def _latest_snapshot_path() -> Optional[Path]:
-    candidates = sorted(
-        PROJECT_ROOT.glob("runs/*/state/runtime_state.json"),
-        key=lambda path: path.stat().st_mtime,
-        reverse=True,
+def _find_available_port(start_port: int = 8765, max_port: int = 9000) -> int:
+    """Find an available port for Gateway.
+
+    Probes the ports in ``[start_port, max_port)`` in ascending order by
+    trying to bind a TCP socket on all interfaces, and returns the first port
+    that can be bound. Binding (rather than connecting) also rejects ports
+    that are reserved by a socket which is bound but not yet listening, or
+    which is bound on a non-loopback interface.
+
+    The result is only a snapshot: the probe socket is closed before the
+    caller uses the port, so another process may still claim it in between.
+
+    Args:
+        start_port: First port to try (inclusive).
+        max_port: Upper bound of the search (exclusive).
+
+    Returns:
+        The first port in the range that could be bound.
+
+    Raises:
+        RuntimeError: If no port in the range could be bound.
+    """
+    import socket
+    for port in range(start_port, max_port):
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            try:
+                s.bind(("", port))
+            except OSError:
+                # In use, or not permitted for this process: try the next one.
+                continue
+            return port
+    raise RuntimeError("No available port found")
+
+
+def _is_gateway_running() -> bool:
+    """Report whether a Gateway subprocess is tracked and has not exited."""
+    proc = _gateway_process
+    # poll() returns None for as long as the child process is still alive.
+    return proc is not None and proc.poll() is None
+
+
+def _stop_gateway() -> bool:
+    """Stop the Gateway process.
+
+    Sends a terminate request, waits up to five seconds for the child to exit,
+    and kills it if it has not. Errors during shutdown are logged, not raised.
+    The module-level process reference is always cleared afterwards.
+
+    Returns:
+        False if no Gateway process was tracked, True otherwise.
+    """
+    global _gateway_process
+    if _gateway_process is None:
+        return False
+
+    try:
+        # Try graceful shutdown first
+        _gateway_process.terminate()
+        try:
+            # communicate() rather than wait(): the child is started with
+            # stdout/stderr pipes, and wait() can deadlock when a pipe buffer
+            # is full. communicate() drains both pipes and closes them.
+            _gateway_process.communicate(timeout=5)
+        except subprocess.TimeoutExpired:
+            # Force kill if graceful shutdown fails
+            _gateway_process.kill()
+            _gateway_process.communicate()
+    except Exception as e:
+        logger.warning(f"Error during gateway shutdown: {e}")
+    finally:
+        _gateway_process = None
+
+    return True
+
+
+def _start_gateway_process(
+    run_id: str,
+    run_dir: Path,
+    bootstrap: Dict[str, Any],
+    port: int
+) -> subprocess.Popen:
+    """Start Gateway as a separate process.
+
+    Launches ``backend.gateway_server`` as a module with the current Python
+    interpreter, using ``PROJECT_ROOT`` as working directory and a copy of
+    this process's environment.
+
+    Args:
+        run_id: Passed to the child as ``--run-id``.
+        run_dir: Passed to the child as ``--run-dir`` (stringified).
+        bootstrap: Serialized with ``json.dumps`` and passed as ``--bootstrap``.
+        port: Passed to the child as ``--port``.
+
+    Returns:
+        The ``subprocess.Popen`` handle of the started child.
+    """
+    # Prepare environment
+    env = os.environ.copy()
+
+    # Create command arguments
+    cmd = [
+        sys.executable,
+        "-m", "backend.gateway_server",
+        "--run-id", run_id,
+        "--run-dir", str(run_dir),
+        "--port", str(port),
+        "--bootstrap", json.dumps(bootstrap)
+    ]
+
+    # Start process
+    # NOTE(review): stdout and stderr are pipes, and nothing visible in this
+    # module reads them while the child is running. The subprocess docs warn
+    # that a child can block once an OS pipe buffer fills up. Consider a log
+    # file or a reader, coordinated with the caller that reads stderr when
+    # startup fails.
+    process = subprocess.Popen(
+        cmd,
+        env=env,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        cwd=PROJECT_ROOT
    )
-    return candidates[0] if candidates else None

-
-def _load_snapshot() -> Dict[str, Any]:
-    snapshot_path = _latest_snapshot_path()
-    if snapshot_path is None or not snapshot_path.exists():
-        raise HTTPException(status_code=503, detail="runtime manager is not initialized")
-    return json.loads(snapshot_path.read_text(encoding="utf-8"))
-
-
-def _get_runtime_payload() -> Dict[str, Any]:
-    if runtime_manager is not None:
-        return runtime_manager.build_snapshot()
-    return _load_snapshot()
-
-
-def _to_state_response(state: AgentRuntimeState) -> RuntimeAgentState:
-    return RuntimeAgentState(
-        agent_id=state.agent_id,
-        status=state.status,
-        last_session=state.last_session,
-        last_updated=state.last_updated.isoformat(),
-    )
+    return process


@router.get("/context", response_model=RunContextResponse)
 async def get_run_context() -> RunContextResponse:
    """Return the most recent run context."""
-    payload = _get_runtime_payload()
-    context = payload.get("context")
+    snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
+    snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
+
+    if not snapshots:
+        raise HTTPException(status_code=404, detail="No run context available")
+
+    latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
+    context = latest.get("context")
    if context is None:
-        raise HTTPException(status_code=404, detail="run context is not ready")
+        raise HTTPException(status_code=404, detail="Run context is not ready")

    return RunContextResponse(
        config_name=context["config_name"],
@@ -144,88 +201,74 @@ async def get_run_context() -> RunContextResponse:


@router.get("/agents", response_model=RuntimeAgentsResponse)
-async def list_agent_states() -> RuntimeAgentsResponse:
-    """List the current runtime state of every registered agent."""
-    payload = _get_runtime_payload()
-    agents = [RuntimeAgentState(**agent) for agent in payload.get("agents", [])]
-    return RuntimeAgentsResponse(agents=agents)
+async def get_runtime_agents() -> RuntimeAgentsResponse:
+    """Return the agent states recorded in the newest runtime snapshot.
+
+    Raises:
+        HTTPException: 404 when no ``runs/*/state/runtime_state.json`` exists.
+    """
+    state_files = list(PROJECT_ROOT.glob("runs/*/state/runtime_state.json"))
+    if not state_files:
+        raise HTTPException(status_code=404, detail="No runtime state available")
+
+    # The snapshot with the most recent modification time is the one served.
+    newest = max(state_files, key=lambda f: f.stat().st_mtime)
+    snapshot = json.loads(newest.read_text(encoding="utf-8"))
+
+    agent_states = []
+    for entry in snapshot.get("agents", []):
+        agent_states.append(RuntimeAgentState(**entry))
+    return RuntimeAgentsResponse(agents=agent_states)


@router.get("/events", response_model=RuntimeEventsResponse)
-async def list_runtime_events() -> RuntimeEventsResponse:
-    """Return the recent runtime events that TradingRuntimeManager emitted."""
-    payload = _get_runtime_payload()
-    events = [RuntimeEvent(**event) for event in payload.get("events", [])]
-    return RuntimeEventsResponse(events=events)
+async def get_runtime_events() -> RuntimeEventsResponse:
+    """Return events from the most recent run."""
+    snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
+    snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)

+    if not snapshots:
+        raise HTTPException(status_code=404, detail="No runtime state available")

-@router.get("/agents/{agent_id}", response_model=RuntimeAgentState)
-async def get_agent_state(agent_id: str) -> RuntimeAgentState:
-    """Return the current runtime state for a single agent."""
-    payload = _get_runtime_payload()
-    state = next(
-        (agent for agent in payload.get("agents", []) if agent["agent_id"] == agent_id),
-        None,
+    latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
+    events = latest.get("events", [])
+
+    return RuntimeEventsResponse(
+        events=[RuntimeEvent(**e) for e in events]
    )
-    if state is None:
-        raise HTTPException(status_code=404, detail=f"agent '{agent_id}' not registered")
-    return RuntimeAgentState(**state)


-def register_runtime_manager(manager: TradingRuntimeManager) -> None:
-    """Allow other modules to expose the runtime manager to the API."""
-    global runtime_manager
-    runtime_manager = manager
+@router.get("/gateway/status", response_model=GatewayStatusResponse)
+async def get_gateway_status() -> GatewayStatusResponse:
+    """Get Gateway process status and port."""
+    global _gateway_port

+    is_running = _is_gateway_running()
+    run_id = None

-def unregister_runtime_manager() -> None:
-    """Drop the runtime manager reference (used for shutdown/testing)."""
-    global runtime_manager
-    runtime_manager = None
-
-
-async def _stop_current_runtime(force: bool = True) -> bool:
-    """Stop the current running runtime if exists.
-
-    Args:
-        force: If True, cancel the running task immediately
-
-    Returns:
-        True if a runtime was stopped, False if no runtime was running
-    """
-    global _running_task, _stop_event
-
-    # Signal stop
-    if _stop_event is not None:
-        _stop_event.set()
-
-    # Cancel running task
-    if _running_task is not None and not _running_task.done():
-        if force:
-            _running_task.cancel()
+    if is_running:
+        # Try to find run_id from runtime state
+        snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
+        snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
+        if snapshots:
            try:
-                await _running_task
-            except asyncio.CancelledError:
-                pass
-        else:
-            # Wait for graceful shutdown
-            try:
-                await asyncio.wait_for(_running_task, timeout=30.0)
-            except asyncio.TimeoutError:
-                _running_task.cancel()
-                try:
-                    await _running_task
-                except asyncio.CancelledError:
-                    pass
+                latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
+                run_id = latest.get("context", {}).get("config_name")
+            except Exception as e:
+                logger.warning(f"Failed to parse latest snapshot: {e}")

-    _running_task = None
-    _stop_event = None
+    return GatewayStatusResponse(
+        is_running=is_running,
+        port=_gateway_port,
+        run_id=run_id
+    )

-    # Unregister runtime manager
-    if runtime_manager is not None:
-        unregister_runtime_manager()

-    return True
+@router.get("/gateway/port")
+async def get_gateway_port() -> Dict[str, Any]:
+    """Return the Gateway port, its liveness, and the WebSocket URL built from the port."""
+    port = _gateway_port
+    running = _is_gateway_running()
+    return {"port": port, "is_running": running, "ws_url": f"ws://localhost:{port}"}


@router.post("/start", response_model=LaunchResponse)
@@ -235,13 +278,18 @@ async def start_runtime(
 ) -> LaunchResponse:
    """Start a new trading runtime with the given configuration.

-    If a runtime is already running, it will be forcefully stopped first.
-    Creates a new timestamped run directory.
+    1. Stop existing Gateway if running
+    2. Generate run ID and directory
+    3. Create runtime manager
+    4. Start Gateway as subprocess (Data Plane)
+    5. Return Gateway port for WebSocket connection
    """
-    global _running_task, _stop_event, runtime_manager
+    global _gateway_process, _gateway_port

-    # 1. Stop current runtime if exists
-    await _stop_current_runtime(force=True)
+    # 1. Stop existing Gateway
+    if _is_gateway_running():
+        _stop_gateway()
+        await asyncio.sleep(1)  # Wait for port release

    # 2. Generate run ID and directory
    run_id = _generate_run_id()
@@ -260,92 +308,136 @@ async def start_runtime(
        "mode": config.mode,
        "start_date": config.start_date,
        "end_date": config.end_date,
+        "poll_interval": config.poll_interval,
+        "enable_mock": config.enable_mock,
    }

-    # 4. Create and prepare runtime manager
-    runtime_manager = TradingRuntimeManager(
+    # 4. Create runtime manager
+    manager = TradingRuntimeManager(
        config_name=run_id,
        run_dir=run_dir,
        bootstrap=bootstrap,
    )
-    runtime_manager.prepare_run()
-    set_global_runtime_manager = None  # Will be set by main module
+    manager.prepare_run()
+    register_runtime_manager(manager)

    # 5. Write BOOTSTRAP.md
    _write_bootstrap_md(run_dir, bootstrap)

-    # 6. Start pipeline in background
-    _stop_event = asyncio.Event()
-    _running_task = asyncio.create_task(
-        _run_pipeline(run_id, run_dir, bootstrap, _stop_event)
-    )
+    # 6. Find available port and start Gateway process
+    _gateway_port = _find_available_port(start_port=8765)
+
+    try:
+        _gateway_process = _start_gateway_process(
+            run_id=run_id,
+            run_dir=run_dir,
+            bootstrap=bootstrap,
+            port=_gateway_port
+        )
+
+        # Wait briefly to check if process started successfully
+        await asyncio.sleep(2)
+
+        if not _is_gateway_running():
+            stdout, stderr = _gateway_process.communicate(timeout=1)
+            _gateway_process = None
+            raise HTTPException(
+                status_code=500,
+                detail=f"Gateway failed to start: {stderr.decode() if stderr else 'Unknown error'}"
+            )
+
+    except Exception as e:
+        _stop_gateway()
+        raise HTTPException(status_code=500, detail=f"Failed to start Gateway: {str(e)}")

    return LaunchResponse(
        run_id=run_id,
        status="started",
        run_dir=str(run_dir),
-        message=f"Runtime started with run_id: {run_id}",
+        gateway_port=_gateway_port,
+        message=f"Runtime started with run_id: {run_id}, Gateway on port: {_gateway_port}",
    )


@router.post("/stop", response_model=StopResponse)
 async def stop_runtime(force: bool = True) -> StopResponse:
-    """Stop the current running runtime.
+    """Stop the current running runtime.
+
+    Args:
+        force: Not used by this implementation.
+
+    Returns:
+        A ``StopResponse`` with status ``"stopped"``.
+
+    Raises:
+        HTTPException: 404 when no Gateway process is running.
+    """
+    global _gateway_process

-    Args:
-        force: If True, forcefully cancel the running task
-    """
-    was_running = await _stop_current_runtime(force=force)
+    was_running = _is_gateway_running()

    if not was_running:
        raise HTTPException(status_code=404, detail="No runtime is currently running")

+    # Stop Gateway process
+    _stop_gateway()
+
+    # Unregister runtime manager
+    unregister_runtime_manager()
+
    return StopResponse(
        status="stopped",
        message="Runtime stopped successfully",
    )


-@router.post("/restart", response_model=RestartResponse)
+@router.post("/restart")
 async def restart_runtime(
    config: LaunchConfig,
    background_tasks: BackgroundTasks
-) -> RestartResponse:
-    """Restart the runtime with a new configuration.
-
-    Equivalent to stop + start.
-    """
+):
+    """Restart the runtime with a new configuration.
+
+    Stops the running runtime, if there is one, and then starts a new one
+    with ``config``. Restarting while nothing is running simply starts.
+
+    Args:
+        config: Launch configuration for the new runtime.
+        background_tasks: Forwarded unchanged to ``start_runtime``.
+
+    Returns:
+        A dict with ``run_id``, ``status`` (``"restarted"``), ``gateway_port``
+        and ``message``.
+    """
    # Stop current runtime
-    await _stop_current_runtime(force=True)
+    # stop_runtime raises a 404 when nothing is running; guard the call so
+    # that a restart from an idle state still reaches the start step.
+    if _is_gateway_running():
+        await stop_runtime(force=True)

    # Start new runtime
    response = await start_runtime(config, background_tasks)

-    return RestartResponse(
-        run_id=response.run_id,
-        status="restarted",
-        message=f"Runtime restarted with run_id: {response.run_id}",
-    )
+    return {
+        "run_id": response.run_id,
+        "status": "restarted",
+        "gateway_port": response.gateway_port,
+        "message": f"Runtime restarted with run_id: {response.run_id}",
+    }


@router.get("/current")
 async def get_current_runtime():
    """Get information about the currently running runtime."""
-    global _running_task, runtime_manager
-
-    is_running = _running_task is not None and not _running_task.done()
-
-    if not is_running or runtime_manager is None:
+    if not _is_gateway_running():
        raise HTTPException(status_code=404, detail="No runtime is currently running")

+    # Find latest runtime state (most recently modified snapshot file)
+    snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
+    snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
+
+    if not snapshots:
+        raise HTTPException(status_code=404, detail="No runtime information available")
+
+    latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
+    # "context" may be present but null in a snapshot; a plain .get() default
+    # would then hand back None and the lookups below would fail.
+    context = latest.get("context") or {}
+
    return {
-        "run_id": runtime_manager.config_name,
-        "run_dir": str(runtime_manager.run_dir),
-        "is_running": is_running,
-        "bootstrap": runtime_manager.bootstrap,
+        "run_id": context.get("config_name"),
+        "run_dir": context.get("run_dir"),
+        "is_running": True,
+        "gateway_port": _gateway_port,
+        "bootstrap": context.get("bootstrap_values", {}),
    }


+def register_runtime_manager(manager: TradingRuntimeManager) -> None:
+    """Allow other modules to expose the runtime manager to the API.
+
+    Stores ``manager`` in the module-level ``runtime_manager`` reference,
+    replacing whatever was registered before.
+    """
+    global runtime_manager
+    runtime_manager = manager
+
+
+def unregister_runtime_manager() -> None:
+    """Drop the runtime manager reference.
+
+    Resets the module-level ``runtime_manager`` to None.
+    """
+    global runtime_manager
+    runtime_manager = None
+
+
 def _write_bootstrap_md(run_dir: Path, bootstrap: Dict[str, Any]) -> None:
    """Write bootstrap configuration to BOOTSTRAP.md."""
    try:
@@ -362,38 +454,7 @@ def _write_bootstrap_md(run_dir: Path, bootstrap: Dict[str, Any]) -> None:
    if yaml:
        front_matter = yaml.safe_dump(values, allow_unicode=True, sort_keys=False)
    else:
-        # Fallback to JSON if yaml not available
        front_matter = json.dumps(values, ensure_ascii=False, indent=2)

    content = f"---\n{front_matter}---\n"
    bootstrap_path.write_text(content, encoding="utf-8")
-
-
-async def _run_pipeline(
-    run_id: str,
-    run_dir: Path,
-    bootstrap: Dict[str, Any],
-    stop_event: asyncio.Event
-) -> None:
-    """Background task to run the trading pipeline."""
-    import logging
-    logger = logging.getLogger(__name__)
-
-    from backend.core.pipeline_runner import run_pipeline
-
-    try:
-        logger.info(f"Starting pipeline for run_id: {run_id}")
-        await run_pipeline(
-            run_id=run_id,
-            run_dir=run_dir,
-            bootstrap=bootstrap,
-            stop_event=stop_event,
-        )
-        logger.info(f"Pipeline completed for run_id: {run_id}")
-    except asyncio.CancelledError:
-        logger.info(f"Pipeline cancelled for run_id: {run_id}")
-        raise
-    except Exception as e:
-        logger.exception(f"Pipeline failed for run_id: {run_id}: {e}")
-        # Re-raise to allow proper cleanup
-        raise