feat: Add evaluation hooks, skill adaptation and team pipeline config
- Add EvaluationHook for post-execution agent evaluation
- Add SkillAdaptationHook for dynamic skill adaptation
- Add team/ directory with team coordination logic
- Add TEAM_PIPELINE.yaml for smoke_fullstack pipeline config
- Update RuntimeView, TraderView and RuntimeSettingsPanel UI
- Add runtimeApi and websocket services
- Add runtime_state.json to smoke_fullstack state

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,19 +1,25 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Runtime API routes exposing the latest trading run state."""
|
||||
"""Runtime API routes - Control Plane for managing Gateway processes."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from fastapi import APIRouter, HTTPException, BackgroundTasks
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from backend.runtime.agent_runtime import AgentRuntimeState
|
||||
from backend.runtime.context import TradingRunContext
|
||||
from backend.runtime.manager import TradingRuntimeManager, get_global_runtime_manager
|
||||
|
||||
router = APIRouter(prefix="/api/runtime", tags=["runtime"])
|
||||
@@ -21,9 +27,9 @@ router = APIRouter(prefix="/api/runtime", tags=["runtime"])
|
||||
runtime_manager: Optional[TradingRuntimeManager] = None
|
||||
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
||||
|
||||
# Global task reference for running pipeline
|
||||
_running_task: Optional[asyncio.Task] = None
|
||||
_stop_event: Optional[asyncio.Event] = None
|
||||
# Gateway process management
|
||||
_gateway_process: Optional[subprocess.Popen] = None
|
||||
_gateway_port: int = 8765
|
||||
|
||||
|
||||
class RunContextResponse(BaseModel):
|
||||
@@ -67,12 +73,15 @@ class LaunchConfig(BaseModel):
|
||||
mode: str = Field(default="live", description="运行模式: live, backtest")
|
||||
start_date: Optional[str] = Field(default=None, description="回测开始日期 YYYY-MM-DD")
|
||||
end_date: Optional[str] = Field(default=None, description="回测结束日期 YYYY-MM-DD")
|
||||
poll_interval: int = Field(default=10, ge=1, le=300, description="市场数据轮询间隔(秒)")
|
||||
enable_mock: bool = Field(default=False, description="是否启用模拟模式(使用模拟价格数据)")
|
||||
|
||||
|
||||
class LaunchResponse(BaseModel):
    """Response body for a runtime launch request.

    Carries the identifier and directory of the run, a status string, the
    port of the Gateway process, and a human-readable message.
    """

    run_id: str
    status: str
    run_dir: str
    gateway_port: int
    message: str
|
||||
|
||||
|
||||
@@ -81,10 +90,10 @@ class StopResponse(BaseModel):
|
||||
message: str
|
||||
|
||||
|
||||
class RestartResponse(BaseModel):
    """Response body for a runtime restart request.

    Holds the identifier of the run, a status string, and a human-readable
    message.
    """

    run_id: str
    status: str
    message: str
|
||||
class GatewayStatusResponse(BaseModel):
    """Response body describing the Gateway process.

    Reports whether the Gateway is running and on which port. ``run_id`` is
    optional and defaults to ``None`` when no run identifier is known.
    """

    is_running: bool
    port: int
    run_id: Optional[str] = None
|
||||
|
||||
|
||||
def _generate_run_id() -> str:
|
||||
@@ -97,44 +106,92 @@ def _get_run_dir(run_id: str) -> Path:
|
||||
return PROJECT_ROOT / "runs" / run_id
|
||||
|
||||
|
||||
def _latest_snapshot_path() -> Optional[Path]:
|
||||
candidates = sorted(
|
||||
PROJECT_ROOT.glob("runs/*/state/runtime_state.json"),
|
||||
key=lambda path: path.stat().st_mtime,
|
||||
reverse=True,
|
||||
def _find_available_port(start_port: int = 8765, max_port: int = 9000) -> int:
    """Find an available TCP port for the Gateway.

    Probes the ports in ``[start_port, max_port)`` in ascending order by
    trying to bind a throwaway socket on localhost, and returns the first
    port for which the bind succeeds.

    A bind probe is used rather than ``connect_ex`` because a connect probe
    only detects sockets that are already *listening*: a port that another
    process has bound but not yet started listening on would be reported as
    free, and the Gateway would then fail with "address already in use".

    Args:
        start_port: First port to probe (inclusive).
        max_port: End of the probe range (exclusive).

    Returns:
        The first port in the range that could be bound.

    Raises:
        RuntimeError: If no port in the range could be bound.
    """
    import socket

    for port in range(start_port, max_port):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            try:
                # SO_REUSEADDR is deliberately not set: the probe should be
                # conservative and skip any port that is still held.
                probe.bind(("localhost", port))
            except OSError:
                continue
            return port
    raise RuntimeError("No available port found")
|
||||
|
||||
|
||||
def _is_gateway_running() -> bool:
    """Report whether the tracked Gateway subprocess is still alive.

    Returns:
        ``False`` when no Gateway process is tracked or when the tracked
        process has already exited; ``True`` otherwise.
    """
    proc = _gateway_process
    # Popen.poll() returns None for as long as the child has not terminated.
    return proc is not None and proc.poll() is None
|
||||
|
||||
|
||||
def _stop_gateway() -> bool:
    """Stop the tracked Gateway subprocess, if there is one.

    Sends a terminate request and waits up to five seconds for the process
    to exit; if it has not exited by then, it is killed and waited on. Any
    error raised during shutdown is logged as a warning rather than
    propagated. The module-level process reference is always cleared.

    Returns:
        ``False`` if no Gateway process was tracked, ``True`` otherwise.
    """
    global _gateway_process

    proc = _gateway_process
    if proc is None:
        return False

    try:
        # Ask politely first, then escalate to a hard kill on timeout.
        proc.terminate()
        try:
            proc.wait(timeout=5)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()
    except Exception as e:
        logger.warning(f"Error during gateway shutdown: {e}")
    finally:
        # Drop the reference regardless of how the shutdown went.
        _gateway_process = None

    return True
|
||||
|
||||
|
||||
def _start_gateway_process(
|
||||
run_id: str,
|
||||
run_dir: Path,
|
||||
bootstrap: Dict[str, Any],
|
||||
port: int
|
||||
) -> subprocess.Popen:
|
||||
"""Start Gateway as a separate process."""
|
||||
# Prepare environment
|
||||
env = os.environ.copy()
|
||||
|
||||
# Create command arguments
|
||||
cmd = [
|
||||
sys.executable,
|
||||
"-m", "backend.gateway_server",
|
||||
"--run-id", run_id,
|
||||
"--run-dir", str(run_dir),
|
||||
"--port", str(port),
|
||||
"--bootstrap", json.dumps(bootstrap)
|
||||
]
|
||||
|
||||
# Start process
|
||||
process = subprocess.Popen(
|
||||
cmd,
|
||||
env=env,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
cwd=PROJECT_ROOT
|
||||
)
|
||||
return candidates[0] if candidates else None
|
||||
|
||||
|
||||
def _load_snapshot() -> Dict[str, Any]:
|
||||
snapshot_path = _latest_snapshot_path()
|
||||
if snapshot_path is None or not snapshot_path.exists():
|
||||
raise HTTPException(status_code=503, detail="runtime manager is not initialized")
|
||||
return json.loads(snapshot_path.read_text(encoding="utf-8"))
|
||||
|
||||
|
||||
def _get_runtime_payload() -> Dict[str, Any]:
|
||||
if runtime_manager is not None:
|
||||
return runtime_manager.build_snapshot()
|
||||
return _load_snapshot()
|
||||
|
||||
|
||||
def _to_state_response(state: AgentRuntimeState) -> RuntimeAgentState:
|
||||
return RuntimeAgentState(
|
||||
agent_id=state.agent_id,
|
||||
status=state.status,
|
||||
last_session=state.last_session,
|
||||
last_updated=state.last_updated.isoformat(),
|
||||
)
|
||||
return process
|
||||
|
||||
|
||||
@router.get("/context", response_model=RunContextResponse)
|
||||
async def get_run_context() -> RunContextResponse:
|
||||
"""Return the most recent run context."""
|
||||
payload = _get_runtime_payload()
|
||||
context = payload.get("context")
|
||||
snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
|
||||
snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
|
||||
|
||||
if not snapshots:
|
||||
raise HTTPException(status_code=404, detail="No run context available")
|
||||
|
||||
latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
|
||||
context = latest.get("context")
|
||||
if context is None:
|
||||
raise HTTPException(status_code=404, detail="run context is not ready")
|
||||
raise HTTPException(status_code=404, detail="Run context is not ready")
|
||||
|
||||
return RunContextResponse(
|
||||
config_name=context["config_name"],
|
||||
@@ -144,88 +201,74 @@ async def get_run_context() -> RunContextResponse:
|
||||
|
||||
|
||||
@router.get("/agents", response_model=RuntimeAgentsResponse)
|
||||
async def list_agent_states() -> RuntimeAgentsResponse:
|
||||
"""List the current runtime state of every registered agent."""
|
||||
payload = _get_runtime_payload()
|
||||
agents = [RuntimeAgentState(**agent) for agent in payload.get("agents", [])]
|
||||
return RuntimeAgentsResponse(agents=agents)
|
||||
async def get_runtime_agents() -> RuntimeAgentsResponse:
|
||||
"""Return agent states from the most recent run."""
|
||||
snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
|
||||
snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
|
||||
|
||||
if not snapshots:
|
||||
raise HTTPException(status_code=404, detail="No runtime state available")
|
||||
|
||||
latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
|
||||
agents = latest.get("agents", [])
|
||||
|
||||
return RuntimeAgentsResponse(
|
||||
agents=[RuntimeAgentState(**a) for a in agents]
|
||||
)
|
||||
|
||||
|
||||
@router.get("/events", response_model=RuntimeEventsResponse)
|
||||
async def list_runtime_events() -> RuntimeEventsResponse:
|
||||
"""Return the recent runtime events that TradingRuntimeManager emitted."""
|
||||
payload = _get_runtime_payload()
|
||||
events = [RuntimeEvent(**event) for event in payload.get("events", [])]
|
||||
return RuntimeEventsResponse(events=events)
|
||||
async def get_runtime_events() -> RuntimeEventsResponse:
|
||||
"""Return events from the most recent run."""
|
||||
snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
|
||||
snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
|
||||
|
||||
if not snapshots:
|
||||
raise HTTPException(status_code=404, detail="No runtime state available")
|
||||
|
||||
@router.get("/agents/{agent_id}", response_model=RuntimeAgentState)
|
||||
async def get_agent_state(agent_id: str) -> RuntimeAgentState:
|
||||
"""Return the current runtime state for a single agent."""
|
||||
payload = _get_runtime_payload()
|
||||
state = next(
|
||||
(agent for agent in payload.get("agents", []) if agent["agent_id"] == agent_id),
|
||||
None,
|
||||
latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
|
||||
events = latest.get("events", [])
|
||||
|
||||
return RuntimeEventsResponse(
|
||||
events=[RuntimeEvent(**e) for e in events]
|
||||
)
|
||||
if state is None:
|
||||
raise HTTPException(status_code=404, detail=f"agent '{agent_id}' not registered")
|
||||
return RuntimeAgentState(**state)
|
||||
|
||||
|
||||
def register_runtime_manager(manager: TradingRuntimeManager) -> None:
    """Publish *manager* as the module-level runtime manager.

    Args:
        manager: The runtime manager instance to store in the module-level
            ``runtime_manager`` variable.
    """
    global runtime_manager
    runtime_manager = manager
|
||||
@router.get("/gateway/status", response_model=GatewayStatusResponse)
|
||||
async def get_gateway_status() -> GatewayStatusResponse:
|
||||
"""Get Gateway process status and port."""
|
||||
global _gateway_port
|
||||
|
||||
is_running = _is_gateway_running()
|
||||
run_id = None
|
||||
|
||||
def unregister_runtime_manager() -> None:
|
||||
"""Drop the runtime manager reference (used for shutdown/testing)."""
|
||||
global runtime_manager
|
||||
runtime_manager = None
|
||||
|
||||
|
||||
async def _stop_current_runtime(force: bool = True) -> bool:
|
||||
"""Stop the current running runtime if exists.
|
||||
|
||||
Args:
|
||||
force: If True, cancel the running task immediately
|
||||
|
||||
Returns:
|
||||
True if a runtime was stopped, False if no runtime was running
|
||||
"""
|
||||
global _running_task, _stop_event
|
||||
|
||||
# Signal stop
|
||||
if _stop_event is not None:
|
||||
_stop_event.set()
|
||||
|
||||
# Cancel running task
|
||||
if _running_task is not None and not _running_task.done():
|
||||
if force:
|
||||
_running_task.cancel()
|
||||
if is_running:
|
||||
# Try to find run_id from runtime state
|
||||
snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
|
||||
snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
|
||||
if snapshots:
|
||||
try:
|
||||
await _running_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
else:
|
||||
# Wait for graceful shutdown
|
||||
try:
|
||||
await asyncio.wait_for(_running_task, timeout=30.0)
|
||||
except asyncio.TimeoutError:
|
||||
_running_task.cancel()
|
||||
try:
|
||||
await _running_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
|
||||
run_id = latest.get("context", {}).get("config_name")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to parse latest snapshot: {e}")
|
||||
|
||||
_running_task = None
|
||||
_stop_event = None
|
||||
return GatewayStatusResponse(
|
||||
is_running=is_running,
|
||||
port=_gateway_port,
|
||||
run_id=run_id
|
||||
)
|
||||
|
||||
# Unregister runtime manager
|
||||
if runtime_manager is not None:
|
||||
unregister_runtime_manager()
|
||||
|
||||
return True
|
||||
@router.get("/gateway/port")
async def get_gateway_port() -> Dict[str, Any]:
    """Return the Gateway connection details for the frontend.

    Returns:
        A dict with the current Gateway port (``port``), whether the Gateway
        process is running (``is_running``), and a localhost WebSocket URL
        built from that port (``ws_url``).
    """
    port = _gateway_port
    running = _is_gateway_running()
    return {
        "port": port,
        "is_running": running,
        "ws_url": f"ws://localhost:{port}",
    }
|
||||
|
||||
|
||||
@router.post("/start", response_model=LaunchResponse)
|
||||
@@ -235,13 +278,18 @@ async def start_runtime(
|
||||
) -> LaunchResponse:
|
||||
"""Start a new trading runtime with the given configuration.
|
||||
|
||||
If a runtime is already running, it will be forcefully stopped first.
|
||||
Creates a new timestamped run directory.
|
||||
1. Stop existing Gateway if running
|
||||
2. Generate run ID and directory
|
||||
3. Create runtime manager
|
||||
4. Start Gateway as subprocess (Data Plane)
|
||||
5. Return Gateway port for WebSocket connection
|
||||
"""
|
||||
global _running_task, _stop_event, runtime_manager
|
||||
global _gateway_process, _gateway_port
|
||||
|
||||
# 1. Stop current runtime if exists
|
||||
await _stop_current_runtime(force=True)
|
||||
# 1. Stop existing Gateway
|
||||
if _is_gateway_running():
|
||||
_stop_gateway()
|
||||
await asyncio.sleep(1) # Wait for port release
|
||||
|
||||
# 2. Generate run ID and directory
|
||||
run_id = _generate_run_id()
|
||||
@@ -260,92 +308,136 @@ async def start_runtime(
|
||||
"mode": config.mode,
|
||||
"start_date": config.start_date,
|
||||
"end_date": config.end_date,
|
||||
"poll_interval": config.poll_interval,
|
||||
"enable_mock": config.enable_mock,
|
||||
}
|
||||
|
||||
# 4. Create and prepare runtime manager
|
||||
runtime_manager = TradingRuntimeManager(
|
||||
# 4. Create runtime manager
|
||||
manager = TradingRuntimeManager(
|
||||
config_name=run_id,
|
||||
run_dir=run_dir,
|
||||
bootstrap=bootstrap,
|
||||
)
|
||||
runtime_manager.prepare_run()
|
||||
set_global_runtime_manager = None # Will be set by main module
|
||||
manager.prepare_run()
|
||||
register_runtime_manager(manager)
|
||||
|
||||
# 5. Write BOOTSTRAP.md
|
||||
_write_bootstrap_md(run_dir, bootstrap)
|
||||
|
||||
# 6. Start pipeline in background
|
||||
_stop_event = asyncio.Event()
|
||||
_running_task = asyncio.create_task(
|
||||
_run_pipeline(run_id, run_dir, bootstrap, _stop_event)
|
||||
)
|
||||
# 6. Find available port and start Gateway process
|
||||
_gateway_port = _find_available_port(start_port=8765)
|
||||
|
||||
try:
|
||||
_gateway_process = _start_gateway_process(
|
||||
run_id=run_id,
|
||||
run_dir=run_dir,
|
||||
bootstrap=bootstrap,
|
||||
port=_gateway_port
|
||||
)
|
||||
|
||||
# Wait briefly to check if process started successfully
|
||||
await asyncio.sleep(2)
|
||||
|
||||
if not _is_gateway_running():
|
||||
stdout, stderr = _gateway_process.communicate(timeout=1)
|
||||
_gateway_process = None
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail=f"Gateway failed to start: {stderr.decode() if stderr else 'Unknown error'}"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
_stop_gateway()
|
||||
raise HTTPException(status_code=500, detail=f"Failed to start Gateway: {str(e)}")
|
||||
|
||||
return LaunchResponse(
|
||||
run_id=run_id,
|
||||
status="started",
|
||||
run_dir=str(run_dir),
|
||||
message=f"Runtime started with run_id: {run_id}",
|
||||
gateway_port=_gateway_port,
|
||||
message=f"Runtime started with run_id: {run_id}, Gateway on port: {_gateway_port}",
|
||||
)
|
||||
|
||||
|
||||
@router.post("/stop", response_model=StopResponse)
|
||||
async def stop_runtime(force: bool = True) -> StopResponse:
|
||||
"""Stop the current running runtime.
|
||||
"""Stop the current running runtime."""
|
||||
global _gateway_process
|
||||
|
||||
Args:
|
||||
force: If True, forcefully cancel the running task
|
||||
"""
|
||||
was_running = await _stop_current_runtime(force=force)
|
||||
was_running = _is_gateway_running()
|
||||
|
||||
if not was_running:
|
||||
raise HTTPException(status_code=404, detail="No runtime is currently running")
|
||||
|
||||
# Stop Gateway process
|
||||
_stop_gateway()
|
||||
|
||||
# Unregister runtime manager
|
||||
unregister_runtime_manager()
|
||||
|
||||
return StopResponse(
|
||||
status="stopped",
|
||||
message="Runtime stopped successfully",
|
||||
)
|
||||
|
||||
|
||||
@router.post("/restart", response_model=RestartResponse)
|
||||
@router.post("/restart")
|
||||
async def restart_runtime(
|
||||
config: LaunchConfig,
|
||||
background_tasks: BackgroundTasks
|
||||
) -> RestartResponse:
|
||||
"""Restart the runtime with a new configuration.
|
||||
|
||||
Equivalent to stop + start.
|
||||
"""
|
||||
):
|
||||
"""Restart the runtime with a new configuration."""
|
||||
# Stop current runtime
|
||||
await _stop_current_runtime(force=True)
|
||||
await stop_runtime(force=True)
|
||||
|
||||
# Start new runtime
|
||||
response = await start_runtime(config, background_tasks)
|
||||
|
||||
return RestartResponse(
|
||||
run_id=response.run_id,
|
||||
status="restarted",
|
||||
message=f"Runtime restarted with run_id: {response.run_id}",
|
||||
)
|
||||
return {
|
||||
"run_id": response.run_id,
|
||||
"status": "restarted",
|
||||
"gateway_port": response.gateway_port,
|
||||
"message": f"Runtime restarted with run_id: {response.run_id}",
|
||||
}
|
||||
|
||||
|
||||
@router.get("/current")
|
||||
async def get_current_runtime():
|
||||
"""Get information about the currently running runtime."""
|
||||
global _running_task, runtime_manager
|
||||
|
||||
is_running = _running_task is not None and not _running_task.done()
|
||||
|
||||
if not is_running or runtime_manager is None:
|
||||
if not _is_gateway_running():
|
||||
raise HTTPException(status_code=404, detail="No runtime is currently running")
|
||||
|
||||
# Find latest runtime state
|
||||
snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
|
||||
snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
|
||||
|
||||
if not snapshots:
|
||||
raise HTTPException(status_code=404, detail="No runtime information available")
|
||||
|
||||
latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
|
||||
context = latest.get("context", {})
|
||||
|
||||
return {
|
||||
"run_id": runtime_manager.config_name,
|
||||
"run_dir": str(runtime_manager.run_dir),
|
||||
"is_running": is_running,
|
||||
"bootstrap": runtime_manager.bootstrap,
|
||||
"run_id": context.get("config_name"),
|
||||
"run_dir": context.get("run_dir"),
|
||||
"is_running": True,
|
||||
"gateway_port": _gateway_port,
|
||||
"bootstrap": context.get("bootstrap_values", {}),
|
||||
}
|
||||
|
||||
|
||||
def register_runtime_manager(manager: TradingRuntimeManager) -> None:
    """Publish *manager* as the module-level runtime manager.

    Args:
        manager: The runtime manager instance to store in the module-level
            ``runtime_manager`` variable.
    """
    global runtime_manager
    runtime_manager = manager
|
||||
|
||||
|
||||
def unregister_runtime_manager() -> None:
    """Clear the module-level runtime manager reference.

    After this call the module-level ``runtime_manager`` variable is ``None``.
    """
    global runtime_manager
    runtime_manager = None
|
||||
|
||||
|
||||
def _write_bootstrap_md(run_dir: Path, bootstrap: Dict[str, Any]) -> None:
|
||||
"""Write bootstrap configuration to BOOTSTRAP.md."""
|
||||
try:
|
||||
@@ -362,38 +454,7 @@ def _write_bootstrap_md(run_dir: Path, bootstrap: Dict[str, Any]) -> None:
|
||||
if yaml:
|
||||
front_matter = yaml.safe_dump(values, allow_unicode=True, sort_keys=False)
|
||||
else:
|
||||
# Fallback to JSON if yaml not available
|
||||
front_matter = json.dumps(values, ensure_ascii=False, indent=2)
|
||||
|
||||
content = f"---\n{front_matter}---\n"
|
||||
bootstrap_path.write_text(content, encoding="utf-8")
|
||||
|
||||
|
||||
async def _run_pipeline(
    run_id: str,
    run_dir: Path,
    bootstrap: Dict[str, Any],
    stop_event: asyncio.Event,
) -> None:
    """Run the trading pipeline as a background task.

    Awaits ``run_pipeline`` with the given arguments and logs the start,
    completion, cancellation, or failure of the run.

    Args:
        run_id: Identifier of the run, used in log messages and forwarded to
            the pipeline.
        run_dir: Run directory forwarded to the pipeline.
        bootstrap: Bootstrap configuration forwarded to the pipeline.
        stop_event: Event forwarded to the pipeline as its stop signal.

    Raises:
        asyncio.CancelledError: Re-raised after logging when the task is
            cancelled.
        Exception: Any pipeline failure is logged with its traceback and
            re-raised.
    """
    import logging

    log = logging.getLogger(__name__)

    # Imported here rather than at module level, as in the original.
    from backend.core.pipeline_runner import run_pipeline

    pipeline_kwargs = {
        "run_id": run_id,
        "run_dir": run_dir,
        "bootstrap": bootstrap,
        "stop_event": stop_event,
    }

    try:
        log.info(f"Starting pipeline for run_id: {run_id}")
        await run_pipeline(**pipeline_kwargs)
        log.info(f"Pipeline completed for run_id: {run_id}")
    except asyncio.CancelledError:
        log.info(f"Pipeline cancelled for run_id: {run_id}")
        raise
    except Exception as e:
        log.exception(f"Pipeline failed for run_id: {run_id}: {e}")
        # Propagate so the owner of the task can observe the failure.
        raise
|
||||
|
||||
Reference in New Issue
Block a user