feat: Add evaluation hooks, skill adaptation and team pipeline config

- Add EvaluationHook for post-execution agent evaluation
- Add SkillAdaptationHook for dynamic skill adaptation
- Add team/ directory with team coordination logic
- Add TEAM_PIPELINE.yaml for smoke_fullstack pipeline config
- Update RuntimeView, TraderView and RuntimeSettingsPanel UI
- Add runtimeApi and websocket services
- Add runtime_state.json to smoke_fullstack state

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-19 18:52:12 +08:00
parent f4a2b7f3af
commit 4b5ac86b83
87 changed files with 5042 additions and 744 deletions

View File

@@ -4,15 +4,20 @@ Agent API Routes
Provides REST API endpoints for agent management within workspaces.
"""
from typing import Any, Dict, List, Optional
import logging
import os
import tempfile
from pathlib import Path
from typing import Any, Dict, List, Optional
from fastapi import APIRouter, HTTPException, Depends, Body
from fastapi import APIRouter, HTTPException, Depends, Body, UploadFile, File, Form
from pydantic import BaseModel, Field
from backend.agents import AgentFactory, WorkspaceManager, get_registry
from backend.agents.skills_manager import SkillsManager
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/workspaces/{workspace_id}/agents", tags=["agents"])
@@ -35,6 +40,13 @@ class UpdateAgentRequest(BaseModel):
disabled_skills: Optional[List[str]] = None
class InstallExternalSkillRequest(BaseModel):
"""Request to install an external skill for one agent."""
source: str = Field(..., description="Directory path, zip path, or http(s) zip URL")
name: Optional[str] = Field(None, description="Optional override skill name")
activate: bool = Field(True, description="Whether to enable skill immediately")
class AgentResponse(BaseModel):
"""Agent information response."""
agent_id: str
@@ -344,6 +356,86 @@ async def disable_skill(
}
@router.post("/{agent_id}/skills/install")
async def install_external_skill(
workspace_id: str,
agent_id: str,
request: InstallExternalSkillRequest,
registry=Depends(get_registry),
):
"""Install an external skill into one agent's local skills."""
agent_info = registry.get(agent_id)
if not agent_info or agent_info.workspace_id != workspace_id:
raise HTTPException(status_code=404, detail=f"Agent '{agent_id}' not found")
skills_manager = SkillsManager()
try:
result = skills_manager.install_external_skill_for_agent(
config_name=workspace_id,
agent_id=agent_id,
source=request.source,
skill_name=request.name,
activate=request.activate,
)
except (FileNotFoundError, ValueError) as exc:
raise HTTPException(status_code=400, detail=str(exc))
return {
"message": f"Installed external skill '{result['skill_name']}' for '{agent_id}'",
**result,
}
@router.post("/{agent_id}/skills/upload")
async def upload_external_skill(
workspace_id: str,
agent_id: str,
file: UploadFile = File(...),
name: Optional[str] = Form(None),
activate: bool = Form(True),
registry=Depends(get_registry),
):
"""Upload a zip skill package from frontend and install for one agent."""
agent_info = registry.get(agent_id)
if not agent_info or agent_info.workspace_id != workspace_id:
raise HTTPException(status_code=404, detail=f"Agent '{agent_id}' not found")
original_name = (file.filename or "").strip()
if not original_name.lower().endswith(".zip"):
raise HTTPException(status_code=400, detail="Uploaded file must be a .zip archive")
suffix = Path(original_name).suffix or ".zip"
temp_path: Optional[str] = None
try:
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
temp_path = tmp.name
content = await file.read()
tmp.write(content)
skills_manager = SkillsManager()
result = skills_manager.install_external_skill_for_agent(
config_name=workspace_id,
agent_id=agent_id,
source=temp_path,
skill_name=name,
activate=activate,
)
except (FileNotFoundError, ValueError) as exc:
raise HTTPException(status_code=400, detail=str(exc))
finally:
try:
await file.close()
except Exception as e:
logger.warning(f"Failed to close uploaded file: {e}")
if temp_path and os.path.exists(temp_path):
os.remove(temp_path)
return {
"message": f"Uploaded and installed external skill '{result['skill_name']}' for '{agent_id}'",
**result,
}
@router.get("/{agent_id}/files/{filename}", response_model=AgentFileResponse)
async def get_agent_file(
workspace_id: str,

View File

@@ -1,19 +1,25 @@
# -*- coding: utf-8 -*-
"""Runtime API routes exposing the latest trading run state."""
"""Runtime API routes - Control Plane for managing Gateway processes."""
from __future__ import annotations
import asyncio
import json
import logging
import os
import signal
import subprocess
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
from fastapi import APIRouter, HTTPException, BackgroundTasks
from pydantic import BaseModel, Field
from backend.runtime.agent_runtime import AgentRuntimeState
from backend.runtime.context import TradingRunContext
from backend.runtime.manager import TradingRuntimeManager, get_global_runtime_manager
router = APIRouter(prefix="/api/runtime", tags=["runtime"])
@@ -21,9 +27,9 @@ router = APIRouter(prefix="/api/runtime", tags=["runtime"])
runtime_manager: Optional[TradingRuntimeManager] = None
PROJECT_ROOT = Path(__file__).resolve().parents[2]
# Global task reference for running pipeline
_running_task: Optional[asyncio.Task] = None
_stop_event: Optional[asyncio.Event] = None
# Gateway process management
_gateway_process: Optional[subprocess.Popen] = None
_gateway_port: int = 8765
class RunContextResponse(BaseModel):
@@ -67,12 +73,15 @@ class LaunchConfig(BaseModel):
mode: str = Field(default="live", description="运行模式: live, backtest")
start_date: Optional[str] = Field(default=None, description="回测开始日期 YYYY-MM-DD")
end_date: Optional[str] = Field(default=None, description="回测结束日期 YYYY-MM-DD")
poll_interval: int = Field(default=10, ge=1, le=300, description="市场数据轮询间隔(秒)")
enable_mock: bool = Field(default=False, description="是否启用模拟模式(使用模拟价格数据)")
class LaunchResponse(BaseModel):
run_id: str
status: str
run_dir: str
gateway_port: int
message: str
@@ -81,10 +90,10 @@ class StopResponse(BaseModel):
message: str
class RestartResponse(BaseModel):
run_id: str
status: str
message: str
class GatewayStatusResponse(BaseModel):
is_running: bool
port: int
run_id: Optional[str] = None
def _generate_run_id() -> str:
@@ -97,44 +106,92 @@ def _get_run_dir(run_id: str) -> Path:
return PROJECT_ROOT / "runs" / run_id
def _latest_snapshot_path() -> Optional[Path]:
candidates = sorted(
PROJECT_ROOT.glob("runs/*/state/runtime_state.json"),
key=lambda path: path.stat().st_mtime,
reverse=True,
def _find_available_port(start_port: int = 8765, max_port: int = 9000) -> int:
"""Find an available port for Gateway."""
import socket
for port in range(start_port, max_port):
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
if s.connect_ex(('localhost', port)) != 0:
return port
raise RuntimeError("No available port found")
def _is_gateway_running() -> bool:
"""Check if Gateway process is running."""
global _gateway_process
if _gateway_process is None:
return False
return _gateway_process.poll() is None
def _stop_gateway() -> bool:
"""Stop the Gateway process."""
global _gateway_process
if _gateway_process is None:
return False
try:
# Try graceful shutdown first
_gateway_process.terminate()
try:
_gateway_process.wait(timeout=5)
except subprocess.TimeoutExpired:
# Force kill if graceful shutdown fails
_gateway_process.kill()
_gateway_process.wait()
except Exception as e:
logger.warning(f"Error during gateway shutdown: {e}")
finally:
_gateway_process = None
return True
def _start_gateway_process(
run_id: str,
run_dir: Path,
bootstrap: Dict[str, Any],
port: int
) -> subprocess.Popen:
"""Start Gateway as a separate process."""
# Prepare environment
env = os.environ.copy()
# Create command arguments
cmd = [
sys.executable,
"-m", "backend.gateway_server",
"--run-id", run_id,
"--run-dir", str(run_dir),
"--port", str(port),
"--bootstrap", json.dumps(bootstrap)
]
# Start process
process = subprocess.Popen(
cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
cwd=PROJECT_ROOT
)
return candidates[0] if candidates else None
def _load_snapshot() -> Dict[str, Any]:
snapshot_path = _latest_snapshot_path()
if snapshot_path is None or not snapshot_path.exists():
raise HTTPException(status_code=503, detail="runtime manager is not initialized")
return json.loads(snapshot_path.read_text(encoding="utf-8"))
def _get_runtime_payload() -> Dict[str, Any]:
if runtime_manager is not None:
return runtime_manager.build_snapshot()
return _load_snapshot()
def _to_state_response(state: AgentRuntimeState) -> RuntimeAgentState:
return RuntimeAgentState(
agent_id=state.agent_id,
status=state.status,
last_session=state.last_session,
last_updated=state.last_updated.isoformat(),
)
return process
@router.get("/context", response_model=RunContextResponse)
async def get_run_context() -> RunContextResponse:
"""Return the most recent run context."""
payload = _get_runtime_payload()
context = payload.get("context")
snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
if not snapshots:
raise HTTPException(status_code=404, detail="No run context available")
latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
context = latest.get("context")
if context is None:
raise HTTPException(status_code=404, detail="run context is not ready")
raise HTTPException(status_code=404, detail="Run context is not ready")
return RunContextResponse(
config_name=context["config_name"],
@@ -144,88 +201,74 @@ async def get_run_context() -> RunContextResponse:
@router.get("/agents", response_model=RuntimeAgentsResponse)
async def list_agent_states() -> RuntimeAgentsResponse:
"""List the current runtime state of every registered agent."""
payload = _get_runtime_payload()
agents = [RuntimeAgentState(**agent) for agent in payload.get("agents", [])]
return RuntimeAgentsResponse(agents=agents)
async def get_runtime_agents() -> RuntimeAgentsResponse:
"""Return agent states from the most recent run."""
snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
if not snapshots:
raise HTTPException(status_code=404, detail="No runtime state available")
latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
agents = latest.get("agents", [])
return RuntimeAgentsResponse(
agents=[RuntimeAgentState(**a) for a in agents]
)
@router.get("/events", response_model=RuntimeEventsResponse)
async def list_runtime_events() -> RuntimeEventsResponse:
"""Return the recent runtime events that TradingRuntimeManager emitted."""
payload = _get_runtime_payload()
events = [RuntimeEvent(**event) for event in payload.get("events", [])]
return RuntimeEventsResponse(events=events)
async def get_runtime_events() -> RuntimeEventsResponse:
"""Return events from the most recent run."""
snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
if not snapshots:
raise HTTPException(status_code=404, detail="No runtime state available")
@router.get("/agents/{agent_id}", response_model=RuntimeAgentState)
async def get_agent_state(agent_id: str) -> RuntimeAgentState:
"""Return the current runtime state for a single agent."""
payload = _get_runtime_payload()
state = next(
(agent for agent in payload.get("agents", []) if agent["agent_id"] == agent_id),
None,
latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
events = latest.get("events", [])
return RuntimeEventsResponse(
events=[RuntimeEvent(**e) for e in events]
)
if state is None:
raise HTTPException(status_code=404, detail=f"agent '{agent_id}' not registered")
return RuntimeAgentState(**state)
def register_runtime_manager(manager: TradingRuntimeManager) -> None:
"""Allow other modules to expose the runtime manager to the API."""
global runtime_manager
runtime_manager = manager
@router.get("/gateway/status", response_model=GatewayStatusResponse)
async def get_gateway_status() -> GatewayStatusResponse:
"""Get Gateway process status and port."""
global _gateway_port
is_running = _is_gateway_running()
run_id = None
def unregister_runtime_manager() -> None:
"""Drop the runtime manager reference (used for shutdown/testing)."""
global runtime_manager
runtime_manager = None
async def _stop_current_runtime(force: bool = True) -> bool:
"""Stop the current running runtime if exists.
Args:
force: If True, cancel the running task immediately
Returns:
True if a runtime was stopped, False if no runtime was running
"""
global _running_task, _stop_event
# Signal stop
if _stop_event is not None:
_stop_event.set()
# Cancel running task
if _running_task is not None and not _running_task.done():
if force:
_running_task.cancel()
if is_running:
# Try to find run_id from runtime state
snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
if snapshots:
try:
await _running_task
except asyncio.CancelledError:
pass
else:
# Wait for graceful shutdown
try:
await asyncio.wait_for(_running_task, timeout=30.0)
except asyncio.TimeoutError:
_running_task.cancel()
try:
await _running_task
except asyncio.CancelledError:
pass
latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
run_id = latest.get("context", {}).get("config_name")
except Exception as e:
logger.warning(f"Failed to parse latest snapshot: {e}")
_running_task = None
_stop_event = None
return GatewayStatusResponse(
is_running=is_running,
port=_gateway_port,
run_id=run_id
)
# Unregister runtime manager
if runtime_manager is not None:
unregister_runtime_manager()
return True
@router.get("/gateway/port")
async def get_gateway_port() -> Dict[str, Any]:
"""Get WebSocket Gateway port for frontend connection."""
global _gateway_port
return {
"port": _gateway_port,
"is_running": _is_gateway_running(),
"ws_url": f"ws://localhost:{_gateway_port}"
}
@router.post("/start", response_model=LaunchResponse)
@@ -235,13 +278,18 @@ async def start_runtime(
) -> LaunchResponse:
"""Start a new trading runtime with the given configuration.
If a runtime is already running, it will be forcefully stopped first.
Creates a new timestamped run directory.
1. Stop existing Gateway if running
2. Generate run ID and directory
3. Create runtime manager
4. Start Gateway as subprocess (Data Plane)
5. Return Gateway port for WebSocket connection
"""
global _running_task, _stop_event, runtime_manager
global _gateway_process, _gateway_port
# 1. Stop current runtime if exists
await _stop_current_runtime(force=True)
# 1. Stop existing Gateway
if _is_gateway_running():
_stop_gateway()
await asyncio.sleep(1) # Wait for port release
# 2. Generate run ID and directory
run_id = _generate_run_id()
@@ -260,92 +308,136 @@ async def start_runtime(
"mode": config.mode,
"start_date": config.start_date,
"end_date": config.end_date,
"poll_interval": config.poll_interval,
"enable_mock": config.enable_mock,
}
# 4. Create and prepare runtime manager
runtime_manager = TradingRuntimeManager(
# 4. Create runtime manager
manager = TradingRuntimeManager(
config_name=run_id,
run_dir=run_dir,
bootstrap=bootstrap,
)
runtime_manager.prepare_run()
set_global_runtime_manager = None # Will be set by main module
manager.prepare_run()
register_runtime_manager(manager)
# 5. Write BOOTSTRAP.md
_write_bootstrap_md(run_dir, bootstrap)
# 6. Start pipeline in background
_stop_event = asyncio.Event()
_running_task = asyncio.create_task(
_run_pipeline(run_id, run_dir, bootstrap, _stop_event)
)
# 6. Find available port and start Gateway process
_gateway_port = _find_available_port(start_port=8765)
try:
_gateway_process = _start_gateway_process(
run_id=run_id,
run_dir=run_dir,
bootstrap=bootstrap,
port=_gateway_port
)
# Wait briefly to check if process started successfully
await asyncio.sleep(2)
if not _is_gateway_running():
stdout, stderr = _gateway_process.communicate(timeout=1)
_gateway_process = None
raise HTTPException(
status_code=500,
detail=f"Gateway failed to start: {stderr.decode() if stderr else 'Unknown error'}"
)
except Exception as e:
_stop_gateway()
raise HTTPException(status_code=500, detail=f"Failed to start Gateway: {str(e)}")
return LaunchResponse(
run_id=run_id,
status="started",
run_dir=str(run_dir),
message=f"Runtime started with run_id: {run_id}",
gateway_port=_gateway_port,
message=f"Runtime started with run_id: {run_id}, Gateway on port: {_gateway_port}",
)
@router.post("/stop", response_model=StopResponse)
async def stop_runtime(force: bool = True) -> StopResponse:
"""Stop the current running runtime.
"""Stop the current running runtime."""
global _gateway_process
Args:
force: If True, forcefully cancel the running task
"""
was_running = await _stop_current_runtime(force=force)
was_running = _is_gateway_running()
if not was_running:
raise HTTPException(status_code=404, detail="No runtime is currently running")
# Stop Gateway process
_stop_gateway()
# Unregister runtime manager
unregister_runtime_manager()
return StopResponse(
status="stopped",
message="Runtime stopped successfully",
)
@router.post("/restart", response_model=RestartResponse)
@router.post("/restart")
async def restart_runtime(
config: LaunchConfig,
background_tasks: BackgroundTasks
) -> RestartResponse:
"""Restart the runtime with a new configuration.
Equivalent to stop + start.
"""
):
"""Restart the runtime with a new configuration."""
# Stop current runtime
await _stop_current_runtime(force=True)
await stop_runtime(force=True)
# Start new runtime
response = await start_runtime(config, background_tasks)
return RestartResponse(
run_id=response.run_id,
status="restarted",
message=f"Runtime restarted with run_id: {response.run_id}",
)
return {
"run_id": response.run_id,
"status": "restarted",
"gateway_port": response.gateway_port,
"message": f"Runtime restarted with run_id: {response.run_id}",
}
@router.get("/current")
async def get_current_runtime():
"""Get information about the currently running runtime."""
global _running_task, runtime_manager
is_running = _running_task is not None and not _running_task.done()
if not is_running or runtime_manager is None:
if not _is_gateway_running():
raise HTTPException(status_code=404, detail="No runtime is currently running")
# Find latest runtime state
snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
if not snapshots:
raise HTTPException(status_code=404, detail="No runtime information available")
latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
context = latest.get("context", {})
return {
"run_id": runtime_manager.config_name,
"run_dir": str(runtime_manager.run_dir),
"is_running": is_running,
"bootstrap": runtime_manager.bootstrap,
"run_id": context.get("config_name"),
"run_dir": context.get("run_dir"),
"is_running": True,
"gateway_port": _gateway_port,
"bootstrap": context.get("bootstrap_values", {}),
}
def register_runtime_manager(manager: TradingRuntimeManager) -> None:
"""Allow other modules to expose the runtime manager to the API."""
global runtime_manager
runtime_manager = manager
def unregister_runtime_manager() -> None:
"""Drop the runtime manager reference."""
global runtime_manager
runtime_manager = None
def _write_bootstrap_md(run_dir: Path, bootstrap: Dict[str, Any]) -> None:
"""Write bootstrap configuration to BOOTSTRAP.md."""
try:
@@ -362,38 +454,7 @@ def _write_bootstrap_md(run_dir: Path, bootstrap: Dict[str, Any]) -> None:
if yaml:
front_matter = yaml.safe_dump(values, allow_unicode=True, sort_keys=False)
else:
# Fallback to JSON if yaml not available
front_matter = json.dumps(values, ensure_ascii=False, indent=2)
content = f"---\n{front_matter}---\n"
bootstrap_path.write_text(content, encoding="utf-8")
async def _run_pipeline(
run_id: str,
run_dir: Path,
bootstrap: Dict[str, Any],
stop_event: asyncio.Event
) -> None:
"""Background task to run the trading pipeline."""
import logging
logger = logging.getLogger(__name__)
from backend.core.pipeline_runner import run_pipeline
try:
logger.info(f"Starting pipeline for run_id: {run_id}")
await run_pipeline(
run_id=run_id,
run_dir=run_dir,
bootstrap=bootstrap,
stop_event=stop_event,
)
logger.info(f"Pipeline completed for run_id: {run_id}")
except asyncio.CancelledError:
logger.info(f"Pipeline cancelled for run_id: {run_id}")
raise
except Exception as e:
logger.exception(f"Pipeline failed for run_id: {run_id}: {e}")
# Re-raise to allow proper cleanup
raise