Refine runtime data flow and UI layering

2026-03-24 15:00:35 +08:00
parent c5eaf2b5ad
commit 6413edf8c9
17 changed files with 373 additions and 114 deletions
--- a/backend/api/runtime.py
+++ b/backend/api/runtime.py
@@ -8,6 +8,7 @@ import json
 import logging
 import os
 import signal
+import shutil
 import subprocess
 import sys
 from datetime import datetime
@@ -194,6 +195,12 @@ class StopResponse(BaseModel):
    message: str


+class CleanupResponse(BaseModel):
+    # Response body returned by the POST /cleanup endpoint.
+
+    # Outcome marker; the cleanup endpoint sets this to "ok".
+    status: str
+    # Retention count that was applied (the limit passed to the pruner,
+    # not a count of the run directories that survived).
+    kept: int
+    # Ids (directory names) of the runs the pruner reported as removed.
+    pruned_run_ids: List[str]
+
+
 class GatewayStatusResponse(BaseModel):
    is_running: bool
    port: int
@@ -235,6 +242,38 @@ def _get_run_dir(run_id: str) -> Path:
    return PROJECT_ROOT / "runs" / run_id


+def _is_timestamped_run_dir(path: Path) -> bool:
+    """Return True when ``path.name`` is a canonical ``%Y%m%d_%H%M%S`` run id.
+
+    ``datetime.strptime`` on its own is lenient: it also accepts fields that
+    are not zero-padded (for example ``2026324_15035``). Such names are not
+    generated timestamps and do not sort chronologically as strings, so they
+    must not be treated as prunable. Formatting the parsed value back and
+    comparing it with the original name accepts only the canonical form.
+
+    Args:
+        path: Candidate run directory; only its final path component is read.
+
+    Returns:
+        True for an exact, valid timestamp name, False otherwise.
+    """
+    stamp_format = "%Y%m%d_%H%M%S"
+    name = path.name
+    try:
+        parsed = datetime.strptime(name, stamp_format)
+    except ValueError:
+        return False
+    # Round-trip check rejects anything strptime tolerated but would not emit.
+    return parsed.strftime(stamp_format) == name
+
+
+def _prune_old_timestamped_runs(*, keep: int = 20, exclude_run_ids: Optional[set[str]] = None) -> list[str]:
+    """Delete old timestamped run directories, preserving the newest ``keep``.
+
+    Only directories directly under ``PROJECT_ROOT / "runs"`` whose names pass
+    ``_is_timestamped_run_dir`` are considered; all other entries are left
+    untouched.
+
+    Args:
+        keep: Number of newest candidates to preserve. Negative values are
+            treated as 0.
+        exclude_run_ids: Run ids that are never deleted. They are filtered out
+            before the newest ``keep`` are selected, so they do not count
+            towards ``keep``.
+
+    Returns:
+        Names of the run directories that were actually removed, newest first.
+    """
+    exclude = exclude_run_ids or set()
+    runs_root = PROJECT_ROOT / "runs"
+    if not runs_root.exists():
+        return []
+
+    # Zero-padded ``%Y%m%d_%H%M%S`` names compare chronologically as strings,
+    # so a reverse sort by name lists the newest runs first.
+    candidates = sorted(
+        (
+            path
+            for path in runs_root.iterdir()
+            if path.is_dir() and _is_timestamped_run_dir(path) and path.name not in exclude
+        ),
+        key=lambda path: path.name,
+        reverse=True,
+    )
+
+    pruned: list[str] = []
+    for path in candidates[max(0, keep):]:
+        # Deletion stays best-effort (errors are ignored), but only report a
+        # directory as pruned when it is really gone afterwards.
+        shutil.rmtree(path, ignore_errors=True)
+        if path.exists():
+            logger.warning("Failed to fully remove run directory: %s", path)
+            continue
+        pruned.append(path.name)
+    return pruned
+
+
 def _find_available_port(start_port: int = 8765, max_port: int = 9000) -> int:
    """Find an available port for Gateway."""
    import socket
@@ -316,15 +355,9 @@ def _start_gateway_process(

@router.get("/context", response_model=RunContextResponse)
 async def get_run_context() -> RunContextResponse:
-    """Return the most recent run context."""
-    snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
-    snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
-
-    if not snapshots:
-        raise HTTPException(status_code=404, detail="No run context available")
-
-    latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
-    context = latest.get("context")
+    """Return active runtime context, or latest persisted context when stopped."""
+    snapshot = _get_active_runtime_snapshot() if _is_gateway_running() else _load_latest_runtime_snapshot()
+    context = snapshot.get("context")
    if context is None:
        raise HTTPException(status_code=404, detail="Run context is not ready")

@@ -337,15 +370,9 @@ async def get_run_context() -> RunContextResponse:

@router.get("/agents", response_model=RuntimeAgentsResponse)
 async def get_runtime_agents() -> RuntimeAgentsResponse:
-    """Return agent states from the most recent run."""
-    snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
-    snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
-
-    if not snapshots:
-        raise HTTPException(status_code=404, detail="No runtime state available")
-
-    latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
-    agents = latest.get("agents", [])
+    """Return agent states from the active runtime, or latest persisted run."""
+    snapshot = _get_active_runtime_snapshot() if _is_gateway_running() else _load_latest_runtime_snapshot()
+    agents = snapshot.get("agents", [])

    return RuntimeAgentsResponse(
        agents=[RuntimeAgentState(**a) for a in agents]
@@ -354,15 +381,9 @@ async def get_runtime_agents() -> RuntimeAgentsResponse:

@router.get("/events", response_model=RuntimeEventsResponse)
 async def get_runtime_events() -> RuntimeEventsResponse:
-    """Return events from the most recent run."""
-    snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
-    snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
-
-    if not snapshots:
-        raise HTTPException(status_code=404, detail="No runtime state available")
-
-    latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
-    events = latest.get("events", [])
+    """Return events from the active runtime, or latest persisted run."""
+    snapshot = _get_active_runtime_snapshot() if _is_gateway_running() else _load_latest_runtime_snapshot()
+    events = snapshot.get("events", [])

    return RuntimeEventsResponse(
        events=[RuntimeEvent(**e) for e in events]
@@ -376,15 +397,10 @@ async def get_gateway_status() -> GatewayStatusResponse:
    run_id = None

    if is_running:
-        # Try to find run_id from runtime state
-        snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
-        snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
-        if snapshots:
-            try:
-                latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
-                run_id = latest.get("context", {}).get("config_name")
-            except Exception as e:
-                logger.warning(f"Failed to parse latest snapshot: {e}")
+        try:
+            run_id = _get_active_runtime_context().get("config_name")
+        except Exception as e:
+            logger.warning(f"Failed to resolve active runtime context: {e}")

    return GatewayStatusResponse(
        is_running=is_running,
@@ -408,7 +424,7 @@ async def get_gateway_port(request: Request) -> Dict[str, Any]:
 async def get_runtime_logs() -> RuntimeLogResponse:
    """Return current runtime log tail, or the latest run log if runtime is stopped."""
    try:
-        context = _get_runtime_context_from_latest_snapshot()
+        context = _get_active_runtime_context() if _is_gateway_running() else _get_runtime_context_from_latest_snapshot()
    except HTTPException:
        return RuntimeLogResponse(is_running=False, content="")

@@ -450,6 +466,21 @@ def _load_latest_runtime_snapshot() -> Dict[str, Any]:
    return json.loads(snapshots[0].read_text(encoding="utf-8"))


+def _get_active_runtime_snapshot() -> Dict[str, Any]:
+    """Return the snapshot describing the currently running runtime.
+
+    The snapshot built in memory by the runtime manager is returned when its
+    context carries a non-empty ``config_name``. In every other case (no
+    manager, no ``build_snapshot`` method, or no usable context) the latest
+    persisted snapshot is loaded instead.
+
+    Raises:
+        HTTPException: 404 when the gateway is not running.
+    """
+    if not _is_gateway_running():
+        raise HTTPException(status_code=404, detail="No runtime is currently running")
+
+    runtime_manager = _runtime_state.runtime_manager
+    if runtime_manager is None or not hasattr(runtime_manager, "build_snapshot"):
+        return _load_latest_runtime_snapshot()
+
+    live_snapshot = runtime_manager.build_snapshot()
+    live_context = live_snapshot.get("context") or {}
+    if live_context.get("config_name"):
+        return live_snapshot
+
+    # In-memory state is not populated yet; fall back to the persisted file.
+    return _load_latest_runtime_snapshot()
+
+
 def _get_runtime_context_from_latest_snapshot() -> Dict[str, Any]:
    """Return the latest persisted runtime context regardless of active process state."""
    latest = _load_latest_runtime_snapshot()
@@ -476,7 +507,16 @@ def _get_current_runtime_context() -> Dict[str, Any]:
    """Return the active runtime context from the latest snapshot."""
    if not _is_gateway_running():
        raise HTTPException(status_code=404, detail="No runtime is currently running")
-    return _get_runtime_context_from_latest_snapshot()
+    snapshot = _get_active_runtime_snapshot()
+    context = snapshot.get("context") or {}
+    if not context.get("config_name"):
+        raise HTTPException(status_code=404, detail="No runtime context available")
+    return context
+
+
+def _get_active_runtime_context() -> Dict[str, Any]:
+    """Return the context of the running runtime.
+
+    Thin alias: delegates to ``_get_current_runtime_context`` and passes its
+    result, or any exception it raises, through unchanged.
+    """
+    active_context = _get_current_runtime_context()
+    return active_context


 def _resolve_runtime_response(run_id: str) -> RuntimeConfigResponse:
@@ -573,6 +613,14 @@ async def start_runtime(
    run_id = _generate_run_id()
    run_dir = _get_run_dir(run_id)

+    retention_keep = max(1, int(os.getenv("RUNS_RETENTION_COUNT", "20") or "20"))
+    pruned_run_ids = _prune_old_timestamped_runs(
+        keep=retention_keep,
+        exclude_run_ids={run_id},
+    )
+    if pruned_run_ids:
+        logger.info("Pruned old run directories: %s", ", ".join(pruned_run_ids))
+
    # 3. Prepare bootstrap config
    bootstrap = {
        "tickers": config.tickers,
@@ -690,6 +738,25 @@ async def stop_runtime(force: bool = True) -> StopResponse:
    )


+@router.post("/cleanup", response_model=CleanupResponse)
+async def cleanup_old_runs(keep: int = 20) -> CleanupResponse:
+    """Prune old timestamped run directories while preserving named runs.
+
+    Args:
+        keep: Number of newest timestamped runs to retain; values below 1 are
+            raised to 1.
+
+    Returns:
+        A ``CleanupResponse`` carrying the effective retention count and the
+        run ids reported as pruned.
+    """
+    keep_count = max(1, int(keep))
+    exclude: set[str] = set()
+
+    if _is_gateway_running():
+        # Keep the active run's id out of the prune candidates.
+        try:
+            active_context = _get_active_runtime_context()
+        except HTTPException as exc:
+            # Best-effort: cleanup still proceeds, but record that the active
+            # run could not be identified and is therefore not excluded.
+            logger.warning("Cleanup could not resolve the active run id: %s", exc.detail)
+        else:
+            active_run_id = str(active_context.get("config_name") or "").strip()
+            if active_run_id:
+                exclude.add(active_run_id)
+
+    pruned = _prune_old_timestamped_runs(keep=keep_count, exclude_run_ids=exclude)
+    return CleanupResponse(status="ok", kept=keep_count, pruned_run_ids=pruned)
+
+
@router.post("/restart")
 async def restart_runtime(
    config: LaunchConfig,
@@ -716,15 +783,7 @@ async def get_current_runtime():
    if not _is_gateway_running():
        raise HTTPException(status_code=404, detail="No runtime is currently running")

-    # Find latest runtime state
-    snapshot_path = PROJECT_ROOT.glob("runs/*/state/runtime_state.json")
-    snapshots = sorted(snapshot_path, key=lambda p: p.stat().st_mtime, reverse=True)
-
-    if not snapshots:
-        raise HTTPException(status_code=404, detail="No runtime information available")
-
-    latest = json.loads(snapshots[0].read_text(encoding="utf-8"))
-    context = latest.get("context", {})
+    context = _get_active_runtime_context()

    return {
        "run_id": context.get("config_name"),