Refine runtime data flow and UI layering

This commit is contained in:
2026-03-24 15:00:35 +08:00
parent c5eaf2b5ad
commit 6413edf8c9
17 changed files with 373 additions and 114 deletions

View File

@@ -8,6 +8,7 @@ import json
import logging
import os
import signal
import shutil
import subprocess
import sys
from datetime import datetime
@@ -194,6 +195,12 @@ class StopResponse(BaseModel):
message: str
class CleanupResponse(BaseModel):
status: str
kept: int
pruned_run_ids: List[str]
class GatewayStatusResponse(BaseModel):
is_running: bool
port: int
@@ -235,6 +242,38 @@ def _get_run_dir(run_id: str) -> Path:
return PROJECT_ROOT / "runs" / run_id
def _is_timestamped_run_dir(path: Path) -> bool:
try:
datetime.strptime(path.name, "%Y%m%d_%H%M%S")
return True
except ValueError:
return False
def _prune_old_timestamped_runs(*, keep: int = 20, exclude_run_ids: Optional[set[str]] = None) -> list[str]:
"""Prune old timestamped run directories, preserving the newest N and excluded ids."""
exclude = exclude_run_ids or set()
runs_root = PROJECT_ROOT / "runs"
if not runs_root.exists():
return []
candidates = sorted(
[
path
for path in runs_root.iterdir()
if path.is_dir() and _is_timestamped_run_dir(path) and path.name not in exclude
],
key=lambda path: path.name,
reverse=True,
)
pruned: list[str] = []
for path in candidates[max(0, keep):]:
shutil.rmtree(path, ignore_errors=True)
pruned.append(path.name)
return pruned
def _find_available_port(start_port: int = 8765, max_port: int = 9000) -> int:
"""Find an available port for Gateway."""
import socket
@@ -316,15 +355,9 @@ def _start_gateway_process(
@router.get("/context", response_model=RunContextResponse)
async def get_run_context() -> RunContextResponse:
"""Return the most recent run context."""
snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
if not snapshots:
raise HTTPException(status_code=404, detail="No run context available")
latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
context = latest.get("context")
"""Return active runtime context, or latest persisted context when stopped."""
snapshot = _get_active_runtime_snapshot() if _is_gateway_running() else _load_latest_runtime_snapshot()
context = snapshot.get("context")
if context is None:
raise HTTPException(status_code=404, detail="Run context is not ready")
@@ -337,15 +370,9 @@ async def get_run_context() -> RunContextResponse:
@router.get("/agents", response_model=RuntimeAgentsResponse)
async def get_runtime_agents() -> RuntimeAgentsResponse:
"""Return agent states from the most recent run."""
snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
if not snapshots:
raise HTTPException(status_code=404, detail="No runtime state available")
latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
agents = latest.get("agents", [])
"""Return agent states from the active runtime, or latest persisted run."""
snapshot = _get_active_runtime_snapshot() if _is_gateway_running() else _load_latest_runtime_snapshot()
agents = snapshot.get("agents", [])
return RuntimeAgentsResponse(
agents=[RuntimeAgentState(**a) for a in agents]
@@ -354,15 +381,9 @@ async def get_runtime_agents() -> RuntimeAgentsResponse:
@router.get("/events", response_model=RuntimeEventsResponse)
async def get_runtime_events() -> RuntimeEventsResponse:
"""Return events from the most recent run."""
snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
if not snapshots:
raise HTTPException(status_code=404, detail="No runtime state available")
latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
events = latest.get("events", [])
"""Return events from the active runtime, or latest persisted run."""
snapshot = _get_active_runtime_snapshot() if _is_gateway_running() else _load_latest_runtime_snapshot()
events = snapshot.get("events", [])
return RuntimeEventsResponse(
events=[RuntimeEvent(**e) for e in events]
@@ -376,15 +397,10 @@ async def get_gateway_status() -> GatewayStatusResponse:
run_id = None
if is_running:
# Try to find run_id from runtime state
snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
if snapshots:
try:
latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
run_id = latest.get("context", {}).get("config_name")
except Exception as e:
logger.warning(f"Failed to parse latest snapshot: {e}")
try:
run_id = _get_active_runtime_context().get("config_name")
except Exception as e:
logger.warning(f"Failed to resolve active runtime context: {e}")
return GatewayStatusResponse(
is_running=is_running,
@@ -408,7 +424,7 @@ async def get_gateway_port(request: Request) -> Dict[str, Any]:
async def get_runtime_logs() -> RuntimeLogResponse:
"""Return current runtime log tail, or the latest run log if runtime is stopped."""
try:
context = _get_runtime_context_from_latest_snapshot()
context = _get_active_runtime_context() if _is_gateway_running() else _get_runtime_context_from_latest_snapshot()
except HTTPException:
return RuntimeLogResponse(is_running=False, content="")
@@ -450,6 +466,21 @@ def _load_latest_runtime_snapshot() -> Dict[str, Any]:
return json.loads(snapshots[0].read_text(encoding="utf-8"))
def _get_active_runtime_snapshot() -> Dict[str, Any]:
"""Return the active runtime snapshot, preferring in-memory manager state."""
if not _is_gateway_running():
raise HTTPException(status_code=404, detail="No runtime is currently running")
manager = _runtime_state.runtime_manager
if manager is not None and hasattr(manager, "build_snapshot"):
snapshot = manager.build_snapshot()
context = snapshot.get("context") or {}
if context.get("config_name"):
return snapshot
return _load_latest_runtime_snapshot()
def _get_runtime_context_from_latest_snapshot() -> Dict[str, Any]:
"""Return the latest persisted runtime context regardless of active process state."""
latest = _load_latest_runtime_snapshot()
@@ -476,7 +507,16 @@ def _get_current_runtime_context() -> Dict[str, Any]:
"""Return the active runtime context from the latest snapshot."""
if not _is_gateway_running():
raise HTTPException(status_code=404, detail="No runtime is currently running")
return _get_runtime_context_from_latest_snapshot()
snapshot = _get_active_runtime_snapshot()
context = snapshot.get("context") or {}
if not context.get("config_name"):
raise HTTPException(status_code=404, detail="No runtime context available")
return context
def _get_active_runtime_context() -> Dict[str, Any]:
"""Return the active runtime context, preferring in-memory runtime manager state."""
return _get_current_runtime_context()
def _resolve_runtime_response(run_id: str) -> RuntimeConfigResponse:
@@ -573,6 +613,14 @@ async def start_runtime(
run_id = _generate_run_id()
run_dir = _get_run_dir(run_id)
retention_keep = max(1, int(os.getenv("RUNS_RETENTION_COUNT", "20") or "20"))
pruned_run_ids = _prune_old_timestamped_runs(
keep=retention_keep,
exclude_run_ids={run_id},
)
if pruned_run_ids:
logger.info("Pruned old run directories: %s", ", ".join(pruned_run_ids))
# 3. Prepare bootstrap config
bootstrap = {
"tickers": config.tickers,
@@ -690,6 +738,25 @@ async def stop_runtime(force: bool = True) -> StopResponse:
)
@router.post("/cleanup", response_model=CleanupResponse)
async def cleanup_old_runs(keep: int = 20) -> CleanupResponse:
"""Prune old timestamped run directories while preserving named runs."""
keep_count = max(1, int(keep))
exclude: set[str] = set()
if _is_gateway_running():
try:
active_context = _get_active_runtime_context()
active_run_id = str(active_context.get("config_name") or "").strip()
if active_run_id:
exclude.add(active_run_id)
except HTTPException:
pass
pruned = _prune_old_timestamped_runs(keep=keep_count, exclude_run_ids=exclude)
return CleanupResponse(status="ok", kept=keep_count, pruned_run_ids=pruned)
@router.post("/restart")
async def restart_runtime(
config: LaunchConfig,
@@ -716,15 +783,7 @@ async def get_current_runtime():
if not _is_gateway_running():
raise HTTPException(status_code=404, detail="No runtime is currently running")
# Find latest runtime state
snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
if not snapshots:
raise HTTPException(status_code=404, detail="No runtime information available")
latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
context = latest.get("context", {})
context = _get_active_runtime_context()
return {
"run_id": context.get("config_name"),