# -*- coding: utf-8 -*- """Runtime API routes - Control Plane for managing Gateway processes.""" from __future__ import annotations import asyncio import json import logging import os import re import shutil import subprocess import sys from datetime import datetime from pathlib import Path from typing import Any, Dict, List, Optional logger = logging.getLogger(__name__) from fastapi import APIRouter, BackgroundTasks, HTTPException, Request from pydantic import BaseModel, Field from backend.config.bootstrap_config import ( resolve_runtime_config, update_bootstrap_values_for_run, ) router = APIRouter(prefix="/api/runtime", tags=["runtime"]) PROJECT_ROOT = Path(__file__).resolve().parents[2] def _normalize_schedule_mode(value: Any) -> str: """Normalize schedule mode to the current public vocabulary. `intraday` is kept as a backward-compatible alias for `interval`. """ mode = str(value or "daily").strip().lower() if mode == "intraday": return "interval" return mode or "daily" class RuntimeState: """Thread-safe singleton for managing runtime state. Encapsulates runtime_manager, _gateway_process, and _gateway_port with asyncio.Lock protection for concurrent access. """ _instance: Optional["RuntimeState"] = None _lock: "threading.Lock" = __import__("threading").Lock() def __new__(cls) -> "RuntimeState": with cls._lock: if cls._instance is None: cls._instance = super().__new__(cls) cls._instance._initialized = False return cls._instance def __init__(self) -> None: if self._initialized: return self._runtime_manager: Optional[Any] = None self._gateway_process: Optional[subprocess.Popen] = None self._gateway_port: int = 8765 self._state_lock = asyncio.Lock() self._initialized = True @property async def lock(self) -> asyncio.Lock: """Get the asyncio lock for state synchronization.""" return self._state_lock @property def runtime_manager(self) -> Optional[Any]: """Get the runtime manager (no lock - read only).""" return self._runtime_manager @runtime_manager.setter def runtime_manager(self, value: Optional[Any]) -> None: """Set the runtime manager.""" self._runtime_manager = value @property def gateway_process(self) -> Optional[subprocess.Popen]: """Get the gateway process (no lock - read only).""" return self._gateway_process @gateway_process.setter def gateway_process(self, value: Optional[subprocess.Popen]) -> None: """Set the gateway process.""" self._gateway_process = value @property def gateway_port(self) -> int: """Get the gateway port.""" return self._gateway_port @gateway_port.setter def gateway_port(self, value: int) -> None: """Set the gateway port.""" self._gateway_port = value async def set_runtime_manager(self, manager: Any) -> None: """Set runtime manager with lock protection.""" async with self._state_lock: self._runtime_manager = manager async def get_runtime_manager(self) -> Optional[Any]: """Get runtime manager with lock protection.""" async with self._state_lock: return self._runtime_manager async def set_gateway_process(self, process: Optional[subprocess.Popen]) -> None: """Set gateway process with lock protection.""" async with self._state_lock: self._gateway_process = process async def get_gateway_process(self) -> Optional[subprocess.Popen]: """Get gateway process with lock protection.""" async with self._state_lock: return self._gateway_process async def set_gateway_port(self, port: int) -> None: """Set gateway port with lock protection.""" async with self._state_lock: self._gateway_port = port async def get_gateway_port(self) -> int: """Get gateway port with lock protection.""" async with self._state_lock: return self._gateway_port # Singleton instance _runtime_state = RuntimeState() def get_runtime_state() -> RuntimeState: """Get the RuntimeState singleton instance.""" return _runtime_state # Backward compatibility: module-level runtime_manager for external imports # This is set by register_runtime_manager() for backward compatibility runtime_manager: Optional[Any] = None class RunContextResponse(BaseModel): config_name: str run_dir: str bootstrap_values: Dict[str, Any] class RuntimeAgentState(BaseModel): agent_id: str display_name: Optional[str] = None status: str last_session: Optional[str] = None last_updated: str class RuntimeAgentsResponse(BaseModel): agents: List[RuntimeAgentState] class RuntimeEvent(BaseModel): timestamp: str event: str details: Dict[str, Any] session: Optional[str] class RuntimeEventsResponse(BaseModel): events: List[RuntimeEvent] class LaunchConfig(BaseModel): """Configuration for launching a new trading task.""" launch_mode: str = Field(default="fresh", description="启动形式: fresh, restore") restore_run_id: Optional[str] = Field(default=None, description="历史任务 run_id,用于恢复启动") tickers: List[str] = Field(default_factory=list, description="股票池") schedule_mode: str = Field(default="daily", description="调度模式: daily, interval") interval_minutes: int = Field(default=60, ge=1, description="间隔分钟数") trigger_time: str = Field(default="09:30", description="触发时间 HH:MM") max_comm_cycles: int = Field(default=2, ge=1, description="最大会商轮数") initial_cash: float = Field(default=100000.0, gt=0, description="初始资金") margin_requirement: float = Field(default=0.0, ge=0, description="保证金要求") enable_memory: bool = Field(default=False, description="是否启用长期记忆") mode: str = Field(default="live", description="运行模式: live, backtest") start_date: Optional[str] = Field(default=None, description="回测开始日期 YYYY-MM-DD") end_date: Optional[str] = Field(default=None, description="回测结束日期 YYYY-MM-DD") poll_interval: int = Field(default=10, ge=1, le=300, description="市场数据轮询间隔(秒)") class LaunchResponse(BaseModel): run_id: str status: str run_dir: str gateway_port: int message: str class RuntimeHistoryItem(BaseModel): run_id: str run_dir: str updated_at: Optional[str] = None total_trades: int = 0 total_asset_value: Optional[float] = None bootstrap: Dict[str, Any] = Field(default_factory=dict) class RuntimeHistoryResponse(BaseModel): runs: List[RuntimeHistoryItem] class StopResponse(BaseModel): status: str message: str class CleanupResponse(BaseModel): status: str kept: int pruned_run_ids: List[str] class GatewayStatusResponse(BaseModel): is_running: bool port: int run_id: Optional[str] = None process_status: Optional[str] = None pid: Optional[int] = None class GatewayHealthResponse(BaseModel): status: str checks: Dict[str, Any] timestamp: str class RuntimeModeResponse(BaseModel): mode: str is_backtest: bool run_id: Optional[str] = None schedule_mode: Optional[str] = None is_running: bool class RuntimeConfigResponse(BaseModel): run_id: str is_running: bool gateway_port: int bootstrap: Dict[str, Any] resolved: Dict[str, Any] class RuntimeLogResponse(BaseModel): run_id: Optional[str] = None is_running: bool log_path: Optional[str] = None content: str = "" class UpdateRuntimeConfigRequest(BaseModel): schedule_mode: Optional[str] = None interval_minutes: Optional[int] = Field(default=None, ge=1) trigger_time: Optional[str] = None max_comm_cycles: Optional[int] = Field(default=None, ge=1) initial_cash: Optional[float] = Field(default=None, gt=0) margin_requirement: Optional[float] = Field(default=None, ge=0) enable_memory: Optional[bool] = None def _generate_run_id() -> str: """Generate timestamp-based run ID: YYYYMMDD_HHMMSS""" return datetime.now().strftime("%Y%m%d_%H%M%S") def _get_run_dir(run_id: str) -> Path: """Return the run directory for a given run ID.""" return PROJECT_ROOT / "runs" / run_id def _load_run_snapshot(run_id: str) -> Dict[str, Any]: """Load a specific run snapshot by run_id.""" snapshot_path = _get_run_dir(run_id) / "state" / "runtime_state.json" if not snapshot_path.exists(): raise HTTPException(status_code=404, detail=f"Run snapshot not found: {run_id}") return json.loads(snapshot_path.read_text(encoding="utf-8")) def _load_run_server_state(run_dir: Path) -> Dict[str, Any]: """Load persisted runtime server state if present.""" server_state_path = run_dir / "state" / "server_state.json" if not server_state_path.exists(): return {} try: return json.loads(server_state_path.read_text(encoding="utf-8")) except Exception: return {} def _resolve_runtime_agent_display_name(run_id: str, agent_id: str) -> Optional[str]: """Best-effort display name for one runtime agent. Priority: 1. PROFILE.md line like `角色定位:中文名` 2. PROFILE.md YAML frontmatter field `name` """ asset_dir = PROJECT_ROOT / "runs" / run_id / "agents" / agent_id profile_path = asset_dir / "PROFILE.md" if not profile_path.exists(): return None try: raw = profile_path.read_text(encoding="utf-8").strip() except Exception: return None if not raw: return None frontmatter_name: Optional[str] = None if raw.startswith("---"): parts = raw.split("---", 2) if len(parts) >= 3: try: import yaml parsed = yaml.safe_load(parts[1].strip()) or {} if isinstance(parsed, dict): value = parsed.get("name") if isinstance(value, str) and value.strip(): frontmatter_name = value.strip() except Exception: pass raw = parts[2].strip() for line in raw.splitlines(): normalized = line.strip() if normalized.startswith("角色定位:"): value = normalized.split(":", 1)[1].strip() if value: return value if normalized.lower().startswith("role:"): value = normalized.split(":", 1)[1].strip() if value: return value return frontmatter_name def _enrich_runtime_agents(run_id: Optional[str], agents: List[Dict[str, Any]]) -> List[Dict[str, Any]]: if not run_id: return agents enriched: List[Dict[str, Any]] = [] for item in agents: payload = dict(item) display_name = payload.get("display_name") agent_id = str(payload.get("agent_id") or "").strip() if agent_id and (not isinstance(display_name, str) or not display_name.strip()): payload["display_name"] = _resolve_runtime_agent_display_name(run_id, agent_id) enriched.append(payload) return enriched def _extract_history_metrics(run_dir: Path) -> tuple[int, Optional[float]]: """Prefer runtime state files over dashboard exports for history summaries.""" server_state = _load_run_server_state(run_dir) portfolio = server_state.get("portfolio") or {} trades = server_state.get("trades") total_trades = len(trades) if isinstance(trades, list) else 0 total_asset_value = None if portfolio.get("total_value") is not None: try: total_asset_value = float(portfolio.get("total_value")) except (TypeError, ValueError): total_asset_value = None if total_trades or total_asset_value is not None: return total_trades, total_asset_value summary_path = run_dir / "team_dashboard" / "summary.json" if not summary_path.exists(): return 0, None try: summary = json.loads(summary_path.read_text(encoding="utf-8")) total_trades = int(summary.get("totalTrades") or 0) total_asset_value = ( float(summary.get("totalAssetValue")) if summary.get("totalAssetValue") is not None else None ) return total_trades, total_asset_value except Exception: return 0, None def _copy_path_if_exists(src: Path, dst: Path) -> None: if not src.exists(): return if src.is_dir(): shutil.copytree(src, dst, dirs_exist_ok=True) else: dst.parent.mkdir(parents=True, exist_ok=True) shutil.copy2(src, dst) def _restore_run_assets(source_run_id: str, target_run_dir: Path) -> None: """Seed a fresh run directory from a historical run snapshot.""" source_run_dir = _get_run_dir(source_run_id) if not source_run_dir.exists(): raise HTTPException(status_code=404, detail=f"Source run not found: {source_run_id}") for relative in [ "team_dashboard/_internal_state.json", "agents", "skills", "memory", "state/server_state.json", "state/runtime.db", "state/research.db", ]: _copy_path_if_exists(source_run_dir / relative, target_run_dir / relative) def _list_runs(limit: int = 50) -> list[RuntimeHistoryItem]: runs_root = PROJECT_ROOT / "runs" if not runs_root.exists(): return [] items: list[RuntimeHistoryItem] = [] run_dirs = sorted( [path for path in runs_root.iterdir() if path.is_dir()], key=lambda path: path.stat().st_mtime, reverse=True, ) for run_dir in run_dirs[: max(1, int(limit))]: run_id = run_dir.name runtime_state_path = run_dir / "state" / "runtime_state.json" bootstrap: Dict[str, Any] = {} updated_at: Optional[str] = None total_trades, total_asset_value = _extract_history_metrics(run_dir) if runtime_state_path.exists(): try: snapshot = json.loads(runtime_state_path.read_text(encoding="utf-8")) context = snapshot.get("context") or {} bootstrap = dict(context.get("bootstrap_values") or {}) updated_at = snapshot.get("events", [{}])[-1].get("timestamp") if snapshot.get("events") else None except Exception: bootstrap = {} items.append( RuntimeHistoryItem( run_id=run_id, run_dir=str(run_dir), updated_at=updated_at, total_trades=total_trades, total_asset_value=total_asset_value, bootstrap=bootstrap, ) ) return items def _is_timestamped_run_dir(path: Path) -> bool: try: datetime.strptime(path.name, "%Y%m%d_%H%M%S") return True except ValueError: return False def _prune_old_timestamped_runs(*, keep: int = 20, exclude_run_ids: Optional[set[str]] = None) -> list[str]: """Prune old timestamped run directories, preserving the newest N and excluded ids.""" exclude = exclude_run_ids or set() runs_root = PROJECT_ROOT / "runs" if not runs_root.exists(): return [] candidates = sorted( [ path for path in runs_root.iterdir() if path.is_dir() and _is_timestamped_run_dir(path) and path.name not in exclude ], key=lambda path: path.name, reverse=True, ) pruned: list[str] = [] for path in candidates[max(0, keep):]: shutil.rmtree(path, ignore_errors=True) pruned.append(path.name) return pruned def _find_available_port(start_port: int = 8765, max_port: int = 9000) -> int: """Find an available port for Gateway.""" import socket for port in range(start_port, max_port): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: if s.connect_ex(('localhost', port)) != 0: return port raise RuntimeError("No available port found") def _is_gateway_running() -> bool: """Check if Gateway process is running. Checks both the internally-managed gateway process and falls back to port availability (for externally-managed gateway processes). The fallback matters because this codebase may still encounter two startup shapes while historical artifacts remain in-tree: 1. runtime_service-managed Gateway subprocesses 2. externally started historical Gateway processes outside the supported dev flow """ process = _runtime_state.gateway_process if process is not None and process.poll() is None: return True # Fallback: check if the gateway port is in use (for externally started gateway) import socket try: with socket.create_connection(("127.0.0.1", _runtime_state.gateway_port), timeout=1): return True except OSError: return False def _stop_gateway() -> bool: """Stop the Gateway process.""" process = _runtime_state.gateway_process if process is None: return False try: # Try graceful shutdown first process.terminate() try: process.wait(timeout=5) except subprocess.TimeoutExpired: # Force kill if graceful shutdown fails process.kill() process.wait() except Exception as e: logger.warning(f"Error during gateway shutdown: {e}") finally: _runtime_state.gateway_process = None return True def _start_gateway_process( run_id: str, run_dir: Path, bootstrap: Dict[str, Any], port: int ) -> subprocess.Popen: """Start Gateway as a runtime_service-managed subprocess. This path is used when runtime lifecycle is driven through the runtime API. It is not the only supported way a Gateway may exist in the current repo. """ # Validate configuration before starting validation_errors = _validate_gateway_config(bootstrap) if validation_errors: raise HTTPException( status_code=400, detail=f"Gateway configuration validation failed: {'; '.join(validation_errors)}" ) # Prepare environment env = os.environ.copy() # Create command arguments cmd = [ sys.executable, "-m", "backend.gateway_server", "--run-id", run_id, "--run-dir", str(run_dir), "--port", str(port), "--bootstrap", json.dumps(bootstrap) ] log_path = run_dir / "logs" / "gateway.log" log_path.parent.mkdir(parents=True, exist_ok=True) log_file = log_path.open("ab") try: process = subprocess.Popen( cmd, env=env, stdout=log_file, stderr=subprocess.STDOUT, cwd=PROJECT_ROOT ) finally: log_file.close() return process def _validate_gateway_config(bootstrap: Dict[str, Any]) -> List[str]: """Validate Gateway bootstrap configuration. Returns a list of validation error messages. Empty list means valid. """ errors: List[str] = [] # Check required environment variables based on mode mode = bootstrap.get("mode", "live") is_backtest = mode == "backtest" # Validate mode if mode not in ("live", "backtest"): errors.append(f"Invalid mode '{mode}': must be 'live' or 'backtest'") # Check API keys based on mode if not is_backtest: # Live mode requires FINNHUB_API_KEY finnhub_key = os.getenv("FINNHUB_API_KEY") if not finnhub_key: errors.append("FINNHUB_API_KEY environment variable is required for live mode") # Check LLM configuration model_name = os.getenv("MODEL_NAME") openai_key = os.getenv("OPENAI_API_KEY") dashscope_key = os.getenv("DASHSCOPE_API_KEY") if not model_name: errors.append("MODEL_NAME environment variable is not set") if not openai_key and not dashscope_key: errors.append("Either OPENAI_API_KEY or DASHSCOPE_API_KEY environment variable must be set") # Validate tickers tickers = bootstrap.get("tickers", []) if not tickers: errors.append("No tickers specified in configuration") elif not isinstance(tickers, list): errors.append("Tickers must be a list") # Validate numeric values try: initial_cash = float(bootstrap.get("initial_cash", 0)) if initial_cash <= 0: errors.append("initial_cash must be greater than 0") except (TypeError, ValueError): errors.append("initial_cash must be a valid number") try: margin_requirement = float(bootstrap.get("margin_requirement", 0)) if margin_requirement < 0 or margin_requirement > 1: errors.append("margin_requirement must be between 0 and 1") except (TypeError, ValueError): errors.append("margin_requirement must be a valid number") # Validate backtest dates if is_backtest: start_date = bootstrap.get("start_date") end_date = bootstrap.get("end_date") if not start_date: errors.append("start_date is required for backtest mode") if not end_date: errors.append("end_date is required for backtest mode") if start_date and end_date: try: from datetime import datetime start = datetime.strptime(start_date, "%Y-%m-%d") end = datetime.strptime(end_date, "%Y-%m-%d") if start >= end: errors.append("start_date must be before end_date") except ValueError: errors.append("Dates must be in YYYY-MM-DD format") # Validate schedule mode schedule_mode = _normalize_schedule_mode(bootstrap.get("schedule_mode", "daily")) if schedule_mode not in ("daily", "interval"): errors.append(f"Invalid schedule_mode '{schedule_mode}': must be 'daily' or 'interval'") return errors def _get_gateway_process_details() -> Dict[str, Any]: """Get detailed information about the Gateway process.""" process = _runtime_state.gateway_process details = { "pid": None, "status": "not_running", "returncode": None, } if process is None: return details details["pid"] = process.pid returncode = process.poll() if returncode is None: details["status"] = "running" details["returncode"] = None else: details["status"] = "exited" details["returncode"] = returncode return details def _check_gateway_health() -> Dict[str, Any]: """Perform comprehensive health checks on Gateway.""" checks = { "process": {"status": "unknown", "details": {}}, "port": {"status": "unknown", "details": {}}, "configuration": {"status": "unknown", "details": {}}, } # Check process status process_details = _get_gateway_process_details() checks["process"]["details"] = process_details if process_details["status"] == "running": checks["process"]["status"] = "healthy" elif process_details["status"] == "exited": checks["process"]["status"] = "unhealthy" checks["process"]["details"]["error"] = f"Process exited with code {process_details['returncode']}" else: checks["process"]["status"] = "unknown" # Check port connectivity import socket port = _runtime_state.gateway_port try: with socket.create_connection(("127.0.0.1", port), timeout=2): checks["port"]["status"] = "healthy" checks["port"]["details"] = {"port": port, "accessible": True} except OSError as e: checks["port"]["status"] = "unhealthy" checks["port"]["details"] = {"port": port, "accessible": False, "error": str(e)} # Check configuration try: if _runtime_state.runtime_manager is not None: checks["configuration"]["status"] = "healthy" checks["configuration"]["details"]["has_runtime_manager"] = True else: checks["configuration"]["status"] = "degraded" checks["configuration"]["details"]["has_runtime_manager"] = False except Exception as e: checks["configuration"]["status"] = "unknown" checks["configuration"]["details"]["error"] = str(e) # Determine overall status statuses = [c["status"] for c in checks.values()] if any(s == "unhealthy" for s in statuses): overall_status = "unhealthy" elif all(s == "healthy" for s in statuses): overall_status = "healthy" else: overall_status = "degraded" return { "status": overall_status, "checks": checks, "timestamp": datetime.now().isoformat(), } @router.get("/context", response_model=RunContextResponse) async def get_run_context() -> RunContextResponse: """Return active runtime context, or latest persisted context when stopped.""" snapshot = _get_active_runtime_snapshot() if _is_gateway_running() else _load_latest_runtime_snapshot() context = snapshot.get("context") if context is None: raise HTTPException(status_code=404, detail="Run context is not ready") return RunContextResponse( config_name=context["config_name"], run_dir=context["run_dir"], bootstrap_values=context["bootstrap_values"], ) @router.get("/agents", response_model=RuntimeAgentsResponse) async def get_runtime_agents() -> RuntimeAgentsResponse: """Return agent states from the active runtime, or latest persisted run.""" snapshot = _get_active_runtime_snapshot() if _is_gateway_running() else _load_latest_runtime_snapshot() run_id = snapshot.get("context", {}).get("config_name") agents = _enrich_runtime_agents(run_id, snapshot.get("agents", [])) return RuntimeAgentsResponse( agents=[RuntimeAgentState(**a) for a in agents] ) @router.get("/events", response_model=RuntimeEventsResponse) async def get_runtime_events() -> RuntimeEventsResponse: """Return events from the active runtime, or latest persisted run.""" snapshot = _get_active_runtime_snapshot() if _is_gateway_running() else _load_latest_runtime_snapshot() events = snapshot.get("events", []) return RuntimeEventsResponse( events=[RuntimeEvent(**e) for e in events] ) @router.get("/history", response_model=RuntimeHistoryResponse) async def get_runtime_history(limit: int = 20) -> RuntimeHistoryResponse: """List recent historical runs for restore/start selection.""" return RuntimeHistoryResponse(runs=_list_runs(limit=limit)) @router.get("/gateway/status", response_model=GatewayStatusResponse) async def get_gateway_status() -> GatewayStatusResponse: """Get Gateway process status and port with detailed process information.""" is_running = _is_gateway_running() run_id = None process_details = _get_gateway_process_details() if is_running: try: run_id = _get_active_runtime_context().get("config_name") except Exception as e: logger.warning(f"Failed to resolve active runtime context: {e}") return GatewayStatusResponse( is_running=is_running, port=_runtime_state.gateway_port, run_id=run_id, process_status=process_details["status"], pid=process_details["pid"], ) @router.get("/gateway/health", response_model=GatewayHealthResponse) async def get_gateway_health() -> GatewayHealthResponse: """Get comprehensive Gateway health check including process, port, and configuration status.""" health = _check_gateway_health() return GatewayHealthResponse(**health) @router.get("/mode", response_model=RuntimeModeResponse) async def get_runtime_mode() -> RuntimeModeResponse: """Get current runtime mode (live or backtest) and related configuration.""" is_running = _is_gateway_running() if not is_running: return RuntimeModeResponse( mode="stopped", is_backtest=False, run_id=None, schedule_mode=None, is_running=False, ) try: context = _get_active_runtime_context() bootstrap = context.get("bootstrap_values", {}) mode = bootstrap.get("mode", "live") return RuntimeModeResponse( mode=mode, is_backtest=mode == "backtest", run_id=context.get("config_name"), schedule_mode=_normalize_schedule_mode(bootstrap.get("schedule_mode")), is_running=True, ) except HTTPException: return RuntimeModeResponse( mode="unknown", is_backtest=False, run_id=None, schedule_mode=None, is_running=False, ) @router.get("/gateway/port") async def get_gateway_port(request: Request) -> Dict[str, Any]: """Get WebSocket Gateway port for frontend connection.""" gateway_port = _runtime_state.gateway_port return { "port": gateway_port, "is_running": _is_gateway_running(), "ws_url": _build_gateway_ws_url(request, gateway_port), } @router.get("/logs", response_model=RuntimeLogResponse) async def get_runtime_logs() -> RuntimeLogResponse: """Return current runtime log tail, or the latest run log if runtime is stopped.""" try: context = _get_active_runtime_context() if _is_gateway_running() else _get_runtime_context_from_latest_snapshot() except HTTPException: return RuntimeLogResponse(is_running=False, content="") run_id = str(context.get("config_name") or "").strip() or None log_path = _get_gateway_log_path_for_run(run_id) if run_id else None content = _read_log_tail(log_path) if log_path else "" return RuntimeLogResponse( run_id=run_id, is_running=_is_gateway_running(), log_path=str(log_path) if log_path else None, content=content, ) def _build_gateway_ws_url(request: Request, port: int) -> str: """Build a proxy-safe Gateway WebSocket URL.""" forwarded_proto = request.headers.get("x-forwarded-proto", "").split(",")[0].strip() scheme = forwarded_proto or request.url.scheme ws_scheme = "wss" if scheme == "https" else "ws" forwarded_host = request.headers.get("x-forwarded-host", "").split(",")[0].strip() host = forwarded_host or request.url.hostname or "localhost" if ":" in host and not host.startswith("["): host = host.split(":", 1)[0] return f"{ws_scheme}://{host}:{port}" def _load_latest_runtime_snapshot() -> Dict[str, Any]: """Load the latest persisted runtime snapshot.""" snapshots = sorted( PROJECT_ROOT.glob("runs/*/state/runtime_state.json"), key=lambda p: p.stat().st_mtime, reverse=True, ) if not snapshots: raise HTTPException(status_code=404, detail="No runtime information available") return json.loads(snapshots[0].read_text(encoding="utf-8")) def _get_active_runtime_snapshot() -> Dict[str, Any]: """Return the active runtime snapshot. For a running Gateway, the canonical runtime source of truth is the run-scoped snapshot file under `runs//state/runtime_state.json`, because the Gateway subprocess mutates it directly while the parent runtime_service process may still hold a stale in-memory manager snapshot. """ if not _is_gateway_running(): raise HTTPException(status_code=404, detail="No runtime is currently running") manager = _runtime_state.runtime_manager if manager is not None: run_id = str(getattr(manager, "config_name", "") or "").strip() if run_id: snapshot_path = _get_run_dir(run_id) / "state" / "runtime_state.json" if snapshot_path.exists(): return json.loads(snapshot_path.read_text(encoding="utf-8")) if manager is not None and hasattr(manager, "build_snapshot"): snapshot = manager.build_snapshot() context = snapshot.get("context") or {} if context.get("config_name"): return snapshot return _load_latest_runtime_snapshot() def _get_runtime_context_from_latest_snapshot() -> Dict[str, Any]: """Return the latest persisted runtime context regardless of active process state.""" latest = _load_latest_runtime_snapshot() context = latest.get("context") or {} if not context.get("config_name"): raise HTTPException(status_code=404, detail="No runtime context available") return context def _get_gateway_log_path_for_run(run_id: str) -> Path: return _get_run_dir(run_id) / "logs" / "gateway.log" def _read_log_tail(path: Path, max_chars: int = 120_000) -> str: if not path.exists() or not path.is_file(): return "" text = path.read_text(encoding="utf-8", errors="replace") text = _sanitize_runtime_log_text(text) if len(text) <= max_chars: return text return text[-max_chars:] def _sanitize_runtime_log_text(text: str) -> str: if not text: return "" # Drop repetitive development-only warnings for unsandboxed skill execution. text = re.sub( r"(?:^|\n)=+\n" r"⚠️\s+\[安全警告\]\s+技能在无沙盒模式下运行\s+\(SKILL_SANDBOX_MODE=none\)\n" r"\s+技能脚本将直接在当前进程中执行,无隔离保护。\n" r"\s+建议:生产环境请设置\s+SKILL_SANDBOX_MODE=docker\n" r"=+\n?", "\n", text, flags=re.MULTILINE, ) text = re.sub(r"\n{3,}", "\n\n", text) return text.strip() def _get_current_runtime_context() -> Dict[str, Any]: """Return the active runtime context from the latest snapshot.""" if not _is_gateway_running(): raise HTTPException(status_code=404, detail="No runtime is currently running") snapshot = _get_active_runtime_snapshot() context = snapshot.get("context") or {} if not context.get("config_name"): raise HTTPException(status_code=404, detail="No runtime context available") return context def _get_active_runtime_context() -> Dict[str, Any]: """Return the active runtime context, preferring in-memory runtime manager state.""" return _get_current_runtime_context() def _resolve_runtime_response(run_id: str) -> RuntimeConfigResponse: """Build a normalized runtime config response for the active run.""" context = _get_current_runtime_context() bootstrap = dict(context.get("bootstrap_values") or {}) resolved = resolve_runtime_config( project_root=PROJECT_ROOT, config_name=run_id, enable_memory=bool(bootstrap.get("enable_memory", False)), schedule_mode=_normalize_schedule_mode(bootstrap.get("schedule_mode", "daily")), interval_minutes=int(bootstrap.get("interval_minutes", 60) or 60), trigger_time=str(bootstrap.get("trigger_time", "09:30") or "09:30"), ) return RuntimeConfigResponse( run_id=run_id, is_running=True, gateway_port=_runtime_state.gateway_port, bootstrap=bootstrap, resolved=resolved, ) def _normalize_runtime_config_updates( request: UpdateRuntimeConfigRequest, ) -> Dict[str, Any]: """Validate and normalize runtime config updates.""" updates: Dict[str, Any] = {} if request.schedule_mode is not None: schedule_mode = _normalize_schedule_mode(request.schedule_mode) if schedule_mode not in {"daily", "interval"}: raise HTTPException( status_code=400, detail="schedule_mode must be 'daily' or 'interval'", ) updates["schedule_mode"] = schedule_mode if request.interval_minutes is not None: updates["interval_minutes"] = int(request.interval_minutes) if request.trigger_time is not None: trigger_time = str(request.trigger_time).strip() if trigger_time and trigger_time != "now": try: datetime.strptime(trigger_time, "%H:%M") except ValueError as exc: raise HTTPException( status_code=400, detail="trigger_time must use HH:MM or 'now'", ) from exc updates["trigger_time"] = trigger_time or "09:30" if request.max_comm_cycles is not None: updates["max_comm_cycles"] = int(request.max_comm_cycles) if request.initial_cash is not None: updates["initial_cash"] = float(request.initial_cash) if request.margin_requirement is not None: updates["margin_requirement"] = float(request.margin_requirement) if request.enable_memory is not None: updates["enable_memory"] = bool(request.enable_memory) if not updates: raise HTTPException(status_code=400, detail="No runtime config updates provided") return updates @router.post("/start", response_model=LaunchResponse) async def start_runtime( config: LaunchConfig, background_tasks: BackgroundTasks ) -> LaunchResponse: """Start a new trading runtime with the given configuration. 1. Stop existing Gateway if running 2. Generate run ID and directory 3. Create runtime manager 4. Start Gateway as subprocess (Data Plane) 5. Return Gateway port for WebSocket connection """ # Lazy import to avoid circular dependency from backend.runtime.manager import TradingRuntimeManager # 1. Stop existing Gateway if _is_gateway_running(): _stop_gateway() await asyncio.sleep(1) # Wait for port release launch_mode = str(config.launch_mode or "fresh").strip().lower() if launch_mode not in {"fresh", "restore"}: raise HTTPException(status_code=400, detail="launch_mode must be 'fresh' or 'restore'") # 2. Resolve run ID, directory, and bootstrap if launch_mode == "restore": restore_run_id = str(config.restore_run_id or "").strip() if not restore_run_id: raise HTTPException(status_code=400, detail="restore_run_id is required when launch_mode=restore") snapshot = _load_run_snapshot(restore_run_id) context = snapshot.get("context") or {} if not context.get("config_name"): raise HTTPException(status_code=404, detail=f"Run context not found: {restore_run_id}") run_id = restore_run_id run_dir = _get_run_dir(run_id) bootstrap = dict(context.get("bootstrap_values") or {}) bootstrap["launch_mode"] = "restore" bootstrap["restore_run_id"] = restore_run_id else: run_id = _generate_run_id() run_dir = _get_run_dir(run_id) bootstrap = { "launch_mode": "fresh", "restore_run_id": None, "tickers": config.tickers, "schedule_mode": config.schedule_mode, "interval_minutes": config.interval_minutes, "trigger_time": config.trigger_time, "max_comm_cycles": config.max_comm_cycles, "initial_cash": config.initial_cash, "margin_requirement": config.margin_requirement, "enable_memory": config.enable_memory, "mode": config.mode, "start_date": config.start_date, "end_date": config.end_date, "poll_interval": config.poll_interval, } retention_keep = max(1, int(os.getenv("RUNS_RETENTION_COUNT", "20") or "20")) pruned_run_ids = _prune_old_timestamped_runs( keep=retention_keep, exclude_run_ids={run_id}, ) if pruned_run_ids: logger.info("Pruned old run directories: %s", ", ".join(pruned_run_ids)) # 4. Create runtime manager manager = TradingRuntimeManager( config_name=run_id, run_dir=run_dir, bootstrap=bootstrap, ) manager.prepare_run() register_runtime_manager(manager) # 5. Write BOOTSTRAP.md _write_bootstrap_md(run_dir, bootstrap) # 6. Find available port and start Gateway process gateway_port = _find_available_port(start_port=8765) _runtime_state.gateway_port = gateway_port try: process = _start_gateway_process( run_id=run_id, run_dir=run_dir, bootstrap=bootstrap, port=gateway_port ) _runtime_state.gateway_process = process # Wait briefly to check if process started successfully await asyncio.sleep(2) if not _is_gateway_running(): _runtime_state.gateway_process = None log_path = _get_gateway_log_path_for_run(run_id) log_tail = _read_log_tail(log_path, max_chars=4000) # Build detailed error message error_details = [] error_details.append(f"Gateway process exited unexpectedly") process_details = _get_gateway_process_details() if process_details.get("returncode") is not None: error_details.append(f"Exit code: {process_details['returncode']}") if log_tail: error_details.append(f"Recent log output:\n{log_tail}") else: error_details.append("No log output available. Check environment configuration.") # Check common configuration issues config_errors = _validate_gateway_config(bootstrap) if config_errors: error_details.append(f"Configuration issues detected: {'; '.join(config_errors)}") raise HTTPException( status_code=500, detail="\n".join(error_details) ) except HTTPException: raise except Exception as e: _stop_gateway() raise HTTPException( status_code=500, detail=f"Failed to start Gateway: {type(e).__name__}: {str(e)}" ) return LaunchResponse( run_id=run_id, status="started", run_dir=str(run_dir), gateway_port=gateway_port, message=f"Runtime started with run_id: {run_id}, Gateway on port: {gateway_port}", ) @router.get("/config", response_model=RuntimeConfigResponse) async def get_runtime_config() -> RuntimeConfigResponse: """Return the current runtime bootstrap and resolved settings.""" context = _get_current_runtime_context() return _resolve_runtime_response(context["config_name"]) @router.put("/config", response_model=RuntimeConfigResponse) async def update_runtime_config( request: UpdateRuntimeConfigRequest, ) -> RuntimeConfigResponse: """Persist selected runtime configuration updates for the active run.""" context = _get_current_runtime_context() run_id = context["config_name"] updates = _normalize_runtime_config_updates(request) updated = update_bootstrap_values_for_run(PROJECT_ROOT, run_id, updates) manager = _runtime_state.runtime_manager if manager is not None and getattr(manager, "config_name", None) == run_id: manager.bootstrap.update(updates) if getattr(manager, "context", None) is not None: manager.context.bootstrap_values.update(updates) if hasattr(manager, "_persist_snapshot"): manager._persist_snapshot() response = _resolve_runtime_response(run_id) response.bootstrap = dict(updated.values) return response @router.post("/stop", response_model=StopResponse) async def stop_runtime(force: bool = True) -> StopResponse: """Stop the current running runtime.""" was_running = _is_gateway_running() if not was_running: process_details = _get_gateway_process_details() if process_details["status"] == "exited": # Process exited but we have a record of it raise HTTPException( status_code=404, detail=( f"No runtime is currently running. " f"Previous Gateway process exited with code {process_details['returncode']}. " f"PID: {process_details['pid']}" ) ) raise HTTPException(status_code=404, detail="No runtime is currently running") # Get process details before stopping for the response process_details = _get_gateway_process_details() pid_info = f" (PID: {process_details.get('pid')})" if process_details.get('pid') else "" # Stop Gateway process stop_success = _stop_gateway() if not stop_success: raise HTTPException( status_code=500, detail=f"Failed to stop Gateway process{pid_info}. Process may have already terminated." ) # Unregister runtime manager unregister_runtime_manager() return StopResponse( status="stopped", message=f"Runtime stopped successfully{pid_info}", ) @router.post("/cleanup", response_model=CleanupResponse) async def cleanup_old_runs(keep: int = 20) -> CleanupResponse: """Prune old timestamped run directories while preserving named runs.""" keep_count = max(1, int(keep)) exclude: set[str] = set() if _is_gateway_running(): try: active_context = _get_active_runtime_context() active_run_id = str(active_context.get("config_name") or "").strip() if active_run_id: exclude.add(active_run_id) except HTTPException: pass pruned = _prune_old_timestamped_runs(keep=keep_count, exclude_run_ids=exclude) return CleanupResponse(status="ok", kept=keep_count, pruned_run_ids=pruned) @router.post("/restart") async def restart_runtime( config: LaunchConfig, background_tasks: BackgroundTasks ): """Restart the runtime with a new configuration.""" # Stop current runtime await stop_runtime(force=True) # Start new runtime response = await start_runtime(config, background_tasks) return { "run_id": response.run_id, "status": "restarted", "gateway_port": response.gateway_port, "message": f"Runtime restarted with run_id: {response.run_id}", } @router.get("/current") async def get_current_runtime(): """Get information about the currently running runtime.""" if not _is_gateway_running(): raise HTTPException(status_code=404, detail="No runtime is currently running") context = _get_active_runtime_context() return { "run_id": context.get("config_name"), "run_dir": context.get("run_dir"), "is_running": True, "gateway_port": _runtime_state.gateway_port, "bootstrap": context.get("bootstrap_values", {}), } def register_runtime_manager(manager: Any) -> None: """Allow other modules to expose the runtime manager to the API.""" global runtime_manager runtime_manager = manager # Also update the RuntimeState singleton for internal consistency _runtime_state.runtime_manager = manager def unregister_runtime_manager() -> None: """Drop the runtime manager reference.""" global runtime_manager runtime_manager = None # Also update the RuntimeState singleton for internal consistency _runtime_state.runtime_manager = None def _write_bootstrap_md(run_dir: Path, bootstrap: Dict[str, Any]) -> None: """Write bootstrap configuration to BOOTSTRAP.md.""" try: import yaml except ImportError: yaml = None bootstrap_path = run_dir / "BOOTSTRAP.md" bootstrap_path.parent.mkdir(parents=True, exist_ok=True) # Filter out None values values = {k: v for k, v in bootstrap.items() if v is not None} if yaml: front_matter = yaml.safe_dump(values, allow_unicode=True, sort_keys=False) else: front_matter = json.dumps(values, ensure_ascii=False, indent=2) content = f"---\n{front_matter}---\n" bootstrap_path.write_text(content, encoding="utf-8")