feat: Refactor services architecture and update project structure
- Remove Docker-based microservices (docker-compose.yml, Makefile, Dockerfiles)
- Update start-dev.sh to use the backend.app:app entry point
- Add shared schema and client modules for service communication
- Add team coordination modules (messenger, registry, task_delegator, coordinator)
- Add evaluation hooks and skill adaptation hooks
- Add skill template and gateway server
- Update frontend WebSocket URL configuration
- Add explain components for insider and technical analysis

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
backend/agents/base/evaluation_hook.py (new file, 452 lines)
@@ -0,0 +1,452 @@
# -*- coding: utf-8 -*-
"""Evaluation hooks system for skills.

Provides evaluation metric collection and storage for skill performance tracking.
Based on the evaluation hooks design in SKILL_TEMPLATE.md.
"""
from __future__ import annotations

import json
import logging
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)


class MetricType(Enum):
    """Types of evaluation metrics."""
    HIT_RATE = "hit_rate"  # signal hit rate
    RISK_VIOLATION = "risk_violation"  # risk-control violation rate
    POSITION_DEVIATION = "position_deviation"  # position deviation rate
    PnL_ATTRIBUTION = "pnl_attribution"  # P&L attribution consistency
    SIGNAL_CONSISTENCY = "signal_consistency"  # signal consistency
    DECISION_LATENCY = "decision_latency"  # decision latency
    TOOL_USAGE = "tool_usage"  # tool usage rate
    CUSTOM = "custom"  # custom metric


@dataclass
class EvaluationMetric:
    """A single evaluation metric."""
    name: str
    metric_type: MetricType
    value: float
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "name": self.name,
            "metric_type": self.metric_type.value,
            "value": self.value,
            "timestamp": self.timestamp,
            "metadata": self.metadata,
        }


@dataclass
class EvaluationResult:
    """Evaluation result for a skill execution."""
    skill_name: str
    run_id: str
    agent_id: str
    metrics: List[EvaluationMetric] = field(default_factory=list)
    inputs: Dict[str, Any] = field(default_factory=dict)
    outputs: Dict[str, Any] = field(default_factory=dict)
    decision: Optional[str] = None
    success: bool = True
    error_message: Optional[str] = None
    started_at: Optional[str] = None
    completed_at: Optional[str] = field(default_factory=lambda: datetime.now().isoformat())

    def to_dict(self) -> Dict[str, Any]:
        return {
            "skill_name": self.skill_name,
            "run_id": self.run_id,
            "agent_id": self.agent_id,
            "metrics": [m.to_dict() for m in self.metrics],
            "inputs": self.inputs,
            "outputs": self.outputs,
            "decision": self.decision,
            "success": self.success,
            "error_message": self.error_message,
            "started_at": self.started_at,
            "completed_at": self.completed_at,
        }


class EvaluationHook:
    """Hook for collecting skill evaluation metrics.

    This hook collects and stores evaluation metrics after skill execution
    for later analysis and memory/reflection stages.
    """

    def __init__(
        self,
        storage_dir: Path,
        run_id: str,
        agent_id: str,
    ):
        """Initialize evaluation hook.

        Args:
            storage_dir: Directory to store evaluation results
            run_id: Current run identifier
            agent_id: Current agent identifier
        """
        self.storage_dir = Path(storage_dir)
        self.run_id = run_id
        self.agent_id = agent_id
        self._current_evaluation: Optional[EvaluationResult] = None

    def start_evaluation(
        self,
        skill_name: str,
        inputs: Dict[str, Any],
    ) -> None:
        """Start a new evaluation session.

        Args:
            skill_name: Name of the skill being evaluated
            inputs: Input parameters for the skill
        """
        self._current_evaluation = EvaluationResult(
            skill_name=skill_name,
            run_id=self.run_id,
            agent_id=self.agent_id,
            inputs=inputs,
            started_at=datetime.now().isoformat(),
        )
        logger.debug(f"Started evaluation for skill: {skill_name}")

    def add_metric(
        self,
        name: str,
        metric_type: MetricType,
        value: float,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Add an evaluation metric.

        Args:
            name: Metric name
            metric_type: Type of metric
            value: Metric value
            metadata: Additional metadata
        """
        if self._current_evaluation is None:
            logger.warning("No active evaluation session, ignoring metric")
            return

        metric = EvaluationMetric(
            name=name,
            metric_type=metric_type,
            value=value,
            metadata=metadata or {},
        )
        self._current_evaluation.metrics.append(metric)
        logger.debug(f"Added metric: {name} = {value}")

    def add_metrics(self, metrics: List[EvaluationMetric]) -> None:
        """Add multiple evaluation metrics at once.

        Args:
            metrics: List of metrics to add
        """
        if self._current_evaluation is None:
            logger.warning("No active evaluation session, ignoring metrics")
            return

        self._current_evaluation.metrics.extend(metrics)

    def record_outputs(self, outputs: Dict[str, Any]) -> None:
        """Record skill outputs.

        Args:
            outputs: Output from skill execution
        """
        if self._current_evaluation is None:
            logger.warning("No active evaluation session, ignoring outputs")
            return

        self._current_evaluation.outputs = outputs

    def record_decision(self, decision: str) -> None:
        """Record the final decision.

        Args:
            decision: Final decision made by the skill
        """
        if self._current_evaluation is None:
            logger.warning("No active evaluation session, ignoring decision")
            return

        self._current_evaluation.decision = decision

    def complete_evaluation(
        self,
        success: bool = True,
        error_message: Optional[str] = None,
    ) -> Optional[EvaluationResult]:
        """Complete the evaluation session and persist results.

        Args:
            success: Whether the skill execution was successful
            error_message: Error message if failed

        Returns:
            The completed evaluation result, or None if no active evaluation
        """
        if self._current_evaluation is None:
            logger.warning("No active evaluation to complete")
            return None

        self._current_evaluation.success = success
        self._current_evaluation.error_message = error_message
        self._current_evaluation.completed_at = datetime.now().isoformat()

        # Persist to storage
        result = self._persist_evaluation(self._current_evaluation)

        self._current_evaluation = None
        logger.debug(f"Completed evaluation for skill: {result.skill_name}")

        return result

    def _persist_evaluation(self, evaluation: EvaluationResult) -> EvaluationResult:
        """Persist evaluation result to storage.

        Args:
            evaluation: Evaluation result to persist

        Returns:
            The persisted evaluation
        """
        # Create run-specific directory
        run_dir = self.storage_dir / self.run_id
        run_dir.mkdir(parents=True, exist_ok=True)

        # Create agent-specific subdirectory
        agent_dir = run_dir / self.agent_id
        agent_dir.mkdir(parents=True, exist_ok=True)

        # Generate filename with timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        filename = f"{evaluation.skill_name}_{timestamp}.json"
        filepath = agent_dir / filename

        # Write evaluation result
        try:
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(evaluation.to_dict(), f, ensure_ascii=False, indent=2)
            logger.info(f"Persisted evaluation to: {filepath}")
        except Exception as e:
            logger.error(f"Failed to persist evaluation: {e}")

        return evaluation

    def cancel_evaluation(self) -> None:
        """Cancel the current evaluation session without saving."""
        if self._current_evaluation is not None:
            logger.debug(f"Cancelled evaluation for: {self._current_evaluation.skill_name}")
            self._current_evaluation = None

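(Illustrative aside, not part of the committed file.) A minimal sketch of the hook lifecycle defined above; the import path, storage directory, identifiers, and metric values are assumptions made up for illustration.

from pathlib import Path

from backend.agents.base.evaluation_hook import EvaluationHook, MetricType

# Hypothetical run: one hook instance per (run, agent) pair.
hook = EvaluationHook(
    storage_dir=Path("./data/evaluations"),
    run_id="run-001",
    agent_id="agent-a",
)

hook.start_evaluation("momentum_signal", inputs={"symbol": "AAPL"})
hook.add_metric("hit_rate", MetricType.HIT_RATE, 0.62)
hook.record_outputs({"signal": "long"})
hook.record_decision("BUY")

# Persists one JSON file under ./data/evaluations/run-001/agent-a/.
result = hook.complete_evaluation(success=True)
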

class EvaluationCollector:
    """Collector for aggregating evaluation metrics across runs.

    Provides methods to query and analyze evaluation results.
    """

    def __init__(self, storage_dir: Path):
        """Initialize evaluation collector.

        Args:
            storage_dir: Root directory containing evaluation results
        """
        self.storage_dir = Path(storage_dir)

    def get_run_evaluations(
        self,
        run_id: str,
        agent_id: Optional[str] = None,
    ) -> List[EvaluationResult]:
        """Get all evaluations for a run.

        Args:
            run_id: Run identifier
            agent_id: Optional agent identifier to filter by

        Returns:
            List of evaluation results
        """
        run_dir = self.storage_dir / run_id
        if not run_dir.exists():
            return []

        evaluations = []

        agent_dirs = [run_dir / agent_id] if agent_id else run_dir.iterdir()

        for agent_dir in agent_dirs:
            if not agent_dir.is_dir():
                continue

            for eval_file in agent_dir.glob("*.json"):
                try:
                    with open(eval_file, "r", encoding="utf-8") as f:
                        data = json.load(f)
                    evaluations.append(self._parse_evaluation(data))
                except Exception as e:
                    logger.warning(f"Failed to load evaluation {eval_file}: {e}")

        return evaluations

    def get_skill_metrics(
        self,
        skill_name: str,
        run_ids: Optional[List[str]] = None,
    ) -> List[EvaluationMetric]:
        """Get all metrics for a specific skill.

        Args:
            skill_name: Name of the skill
            run_ids: Optional list of run IDs to filter by

        Returns:
            List of metrics for the skill
        """
        metrics = []

        if run_ids is None:
            run_ids = [d.name for d in self.storage_dir.iterdir() if d.is_dir()]

        for run_id in run_ids:
            evaluations = self.get_run_evaluations(run_id)
            for eval_result in evaluations:
                if eval_result.skill_name == skill_name:
                    metrics.extend(eval_result.metrics)

        return metrics

    def calculate_skill_stats(
        self,
        skill_name: str,
        metric_type: MetricType,
        run_ids: Optional[List[str]] = None,
    ) -> Dict[str, float]:
        """Calculate statistics for a specific metric type.

        Args:
            skill_name: Name of the skill
            metric_type: Type of metric to calculate
            run_ids: Optional list of run IDs to filter by

        Returns:
            Dictionary with min, max, avg, count statistics
        """
        metrics = self.get_skill_metrics(skill_name, run_ids)
        filtered = [m for m in metrics if m.metric_type == metric_type]

        if not filtered:
            return {"count": 0}

        values = [m.value for m in filtered]
        return {
            "count": len(values),
            "min": min(values),
            "max": max(values),
            "avg": sum(values) / len(values),
        }

    def _parse_evaluation(self, data: Dict[str, Any]) -> EvaluationResult:
        """Parse evaluation data into EvaluationResult.

        Args:
            data: Raw evaluation data

        Returns:
            Parsed EvaluationResult
        """
        metrics = []
        for m in data.get("metrics", []):
            metrics.append(EvaluationMetric(
                name=m["name"],
                metric_type=MetricType(m["metric_type"]),
                value=m["value"],
                timestamp=m.get("timestamp", ""),
                metadata=m.get("metadata", {}),
            ))

        return EvaluationResult(
            skill_name=data["skill_name"],
            run_id=data["run_id"],
            agent_id=data["agent_id"],
            metrics=metrics,
            inputs=data.get("inputs", {}),
            outputs=data.get("outputs", {}),
            decision=data.get("decision"),
            success=data.get("success", True),
            error_message=data.get("error_message"),
            started_at=data.get("started_at"),
            completed_at=data.get("completed_at"),
        )

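(Illustrative aside, not part of the committed file.) A sketch of querying what the hook persisted; the path, run ID, agent ID, and skill name are the same made-up values as above.

from pathlib import Path

from backend.agents.base.evaluation_hook import EvaluationCollector, MetricType

collector = EvaluationCollector(Path("./data/evaluations"))

# All results one agent produced in a single run.
results = collector.get_run_evaluations("run-001", agent_id="agent-a")

# Aggregate hit-rate stats for one skill across every stored run.
stats = collector.calculate_skill_stats("momentum_signal", MetricType.HIT_RATE)
print(stats)  # e.g. {"count": 3, "min": 0.41, "max": 0.70, "avg": 0.57}
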

def parse_evaluation_hooks(skill_dir: Path) -> Dict[str, Any]:
    """Parse evaluation hooks from SKILL.md.

    Extracts the "Optional: Evaluation hooks" section from skill documentation.

    Args:
        skill_dir: Skill directory path

    Returns:
        Dictionary containing evaluation hook definitions
    """
    skill_md = skill_dir / "SKILL.md"
    if not skill_md.exists():
        return {}

    try:
        content = skill_md.read_text(encoding="utf-8")

        # Extract evaluation hooks section
        if "## Optional: Evaluation hooks" in content:
            start = content.find("## Optional: Evaluation hooks")
            # Find the next ## section or end of file
            next_section = content.find("\n## ", start + 1)
            if next_section == -1:
                eval_section = content[start:]
            else:
                eval_section = content[start:next_section]

            # Parse metrics from the section
            metrics = []
            for metric_type in MetricType:
                if metric_type.value.replace("_", " ") in eval_section.lower():
                    metrics.append(metric_type.value)

            return {
                "supported_metrics": metrics,
                "section_content": eval_section.strip(),
            }
    except Exception as e:
        logger.warning(f"Failed to parse evaluation hooks: {e}")

    return {}


__all__ = [
    "MetricType",
    "EvaluationMetric",
    "EvaluationResult",
    "EvaluationHook",
    "EvaluationCollector",
    "parse_evaluation_hooks",
]
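(Illustrative aside, not part of the committed file.) A sketch of the SKILL.md convention the parser expects; the file contents below are a guess at the shape, based only on the heading string the function searches for and the "metric name with spaces" matching rule.

from pathlib import Path
import tempfile

from backend.agents.base.evaluation_hook import parse_evaluation_hooks

# Hypothetical SKILL.md containing the section heading the parser looks for.
skill_dir = Path(tempfile.mkdtemp())
(skill_dir / "SKILL.md").write_text(
    "# My Skill\n"
    "\n"
    "## Optional: Evaluation hooks\n"
    "Track hit rate and decision latency per execution.\n",
    encoding="utf-8",
)

hooks = parse_evaluation_hooks(skill_dir)
print(hooks["supported_metrics"])  # ['hit_rate', 'decision_latency']
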
backend/agents/base/skill_adaptation_hook.py (new file, 489 lines)
@@ -0,0 +1,489 @@
# -*- coding: utf-8 -*-
"""Skill adaptation hook for an automatic evaluation-to-iteration feedback loop.

Monitors evaluation metrics against configurable thresholds and triggers
automatic skill reload or logs warnings when thresholds are breached.
"""
from __future__ import annotations

import json
import logging
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional

from .evaluation_hook import (
    EvaluationCollector,
    MetricType,
)

logger = logging.getLogger(__name__)


class AdaptationAction(Enum):
    """Actions to take when threshold is breached."""
    RELOAD = "reload"  # automatically reload the skill
    WARN = "warn"  # log a warning for human review
    BOTH = "both"  # reload and warn
    NONE = "none"  # take no action


@dataclass
class AdaptationThreshold:
    """Threshold configuration for a metric."""
    metric_type: MetricType
    operator: str = "lt"  # lt (less than), gt (greater than), lte, gte, eq
    value: float = 0.0
    window_size: int = 10  # sliding-window size used for the moving average
    min_samples: int = 5  # minimum number of samples before the check triggers
    action: AdaptationAction = AdaptationAction.WARN
    cooldown_seconds: int = 300  # cooldown period after a trigger

    def evaluate(self, current_value: float) -> bool:
        """Evaluate if threshold is breached."""
        ops = {
            "lt": lambda x, y: x < y,
            "lte": lambda x, y: x <= y,
            "gt": lambda x, y: x > y,
            "gte": lambda x, y: x >= y,
            "eq": lambda x, y: x == y,
        }
        op_func = ops.get(self.operator)
        if op_func is None:
            logger.warning(f"Unknown operator: {self.operator}")
            return False
        return op_func(current_value, self.value)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "metric_type": self.metric_type.value,
            "operator": self.operator,
            "value": self.value,
            "window_size": self.window_size,
            "min_samples": self.min_samples,
            "action": self.action.value,
            "cooldown_seconds": self.cooldown_seconds,
        }


@dataclass
class AdaptationEvent:
    """Record of an adaptation trigger event."""
    timestamp: str
    skill_name: str
    metric_type: MetricType
    threshold: AdaptationThreshold
    current_value: float
    avg_value: float
    action_taken: AdaptationAction
    details: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "timestamp": self.timestamp,
            "skill_name": self.skill_name,
            "metric_type": self.metric_type.value,
            "threshold": self.threshold.to_dict(),
            "current_value": self.current_value,
            "avg_value": self.avg_value,
            "action_taken": self.action_taken.value,
            "details": self.details,
        }


class SkillAdaptationHook:
    """Hook for monitoring evaluation metrics and triggering skill adaptation.

    This hook wraps EvaluationHook to add threshold-based adaptation logic.
    When metrics breach configured thresholds, it can:
    - Automatically reload skills via SkillsManager
    - Log warnings for human review
    - Both
    """

    # Default thresholds for common metrics
    DEFAULT_THRESHOLDS: List[AdaptationThreshold] = [
        AdaptationThreshold(
            metric_type=MetricType.HIT_RATE,
            operator="lt",
            value=0.5,
            action=AdaptationAction.WARN,
            cooldown_seconds=600,
        ),
        AdaptationThreshold(
            metric_type=MetricType.RISK_VIOLATION,
            operator="gt",
            value=0.1,
            action=AdaptationAction.WARN,
            cooldown_seconds=300,
        ),
        AdaptationThreshold(
            metric_type=MetricType.DECISION_LATENCY,
            operator="gt",
            value=5000,  # 5000 ms (5 seconds)
            action=AdaptationAction.WARN,
            cooldown_seconds=300,
        ),
    ]

    def __init__(
        self,
        storage_dir: Path,
        run_id: str,
        agent_id: str,
        thresholds: Optional[List[AdaptationThreshold]] = None,
        collector: Optional[EvaluationCollector] = None,
    ):
        """Initialize skill adaptation hook.

        Args:
            storage_dir: Directory to store adaptation events
            run_id: Current run identifier
            agent_id: Current agent identifier
            thresholds: Custom threshold configurations (uses defaults if None)
            collector: Optional EvaluationCollector for historical data
        """
        self.storage_dir = Path(storage_dir)
        self.run_id = run_id
        self.agent_id = agent_id
        # Copy so add_threshold/remove_threshold never mutate the shared class-level defaults
        self.thresholds = list(thresholds) if thresholds is not None else list(self.DEFAULT_THRESHOLDS)
        self.collector = collector or EvaluationCollector(storage_dir)

        # Track cooldowns to prevent rapid re-triggering
        self._cooldowns: Dict[str, datetime] = {}

        # Store recent metrics in memory for quick access
        self._recent_metrics: Dict[str, List[float]] = {}

        # Pending adaptation events
        self._pending_events: List[AdaptationEvent] = []

    def check_threshold(
        self,
        skill_name: str,
        metric_type: MetricType,
        current_value: float,
    ) -> Optional[AdaptationEvent]:
        """Check if a metric breaches any threshold.

        Args:
            skill_name: Name of the skill
            metric_type: Type of metric
            current_value: Current metric value

        Returns:
            AdaptationEvent if threshold breached, None otherwise
        """
        # Find applicable thresholds
        applicable_thresholds = [
            t for t in self.thresholds
            if t.metric_type == metric_type
        ]

        if not applicable_thresholds:
            return None

        # Check cooldown
        cooldown_key = f"{skill_name}:{metric_type.value}"
        now = datetime.now()
        last_trigger = self._cooldowns.get(cooldown_key)

        # Store current value first for avg calculation
        self._store_metric(cooldown_key, current_value)

        for threshold in applicable_thresholds:
            if last_trigger:
                elapsed = (now - last_trigger).total_seconds()
                if elapsed < threshold.cooldown_seconds:
                    continue

            # Evaluate threshold
            if threshold.evaluate(current_value):
                # Calculate moving average
                avg_value = self._calculate_avg(skill_name, metric_type, current_value)

                # Check minimum samples (allow immediate trigger if min_samples <= 1)
                sample_count = len(self._recent_metrics.get(cooldown_key, []))
                if threshold.min_samples > 1 and sample_count < threshold.min_samples:
                    # Not enough samples yet
                    continue

                # Trigger adaptation
                event = AdaptationEvent(
                    timestamp=now.isoformat(),
                    skill_name=skill_name,
                    metric_type=metric_type,
                    threshold=threshold,
                    current_value=current_value,
                    avg_value=avg_value,
                    action_taken=threshold.action,
                    details={
                        "run_id": self.run_id,
                        "agent_id": self.agent_id,
                    },
                )

                # Update cooldown
                self._cooldowns[cooldown_key] = now

                # Persist event
                self._persist_event(event)

                logger.info(
                    f"Threshold breached for {skill_name}.{metric_type.value}: "
                    f"current={current_value}, avg={avg_value}, action={threshold.action.value}"
                )

                return event

        return None

    def _calculate_avg(
        self,
        skill_name: str,
        metric_type: MetricType,
        current_value: float,
    ) -> float:
        """Calculate moving average for a metric."""
        key = f"{skill_name}:{metric_type.value}"
        values = self._recent_metrics.get(key, [])
        if not values:
            return current_value
        return sum(values) / len(values)

    def _store_metric(self, key: str, value: float) -> None:
        """Store metric value with sliding window."""
        if key not in self._recent_metrics:
            self._recent_metrics[key] = []
        self._recent_metrics[key].append(value)
        # Keep only last 100 values
        if len(self._recent_metrics[key]) > 100:
            self._recent_metrics[key] = self._recent_metrics[key][-100:]

    def _persist_event(self, event: AdaptationEvent) -> None:
        """Persist adaptation event to storage."""
        run_dir = self.storage_dir / self.run_id / "adaptations"
        run_dir.mkdir(parents=True, exist_ok=True)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        filename = f"{event.skill_name}_{event.metric_type.value}_{timestamp}.json"
        filepath = run_dir / filename

        try:
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(event.to_dict(), f, ensure_ascii=False, indent=2)
            logger.debug(f"Persisted adaptation event to: {filepath}")
        except Exception as e:
            logger.error(f"Failed to persist adaptation event: {e}")

        # Also add to pending list
        self._pending_events.append(event)

    def get_pending_warnings(self) -> List[AdaptationEvent]:
        """Get all pending warning events that need human review."""
        return [
            e for e in self._pending_events
            if e.action_taken in (AdaptationAction.WARN, AdaptationAction.BOTH)
        ]

    def clear_pending_warnings(self) -> None:
        """Clear pending warnings after they have been reviewed."""
        self._pending_events = [
            e for e in self._pending_events
            if e.action_taken == AdaptationAction.RELOAD
        ]

    def get_recent_events(
        self,
        skill_name: Optional[str] = None,
        metric_type: Optional[MetricType] = None,
        limit: int = 50,
    ) -> List[AdaptationEvent]:
        """Get recent adaptation events.

        Args:
            skill_name: Optional filter by skill name
            metric_type: Optional filter by metric type
            limit: Maximum number of events to return

        Returns:
            List of recent adaptation events
        """
        events_dir = self.storage_dir / self.run_id / "adaptations"
        if not events_dir.exists():
            return []

        events = []
        for eval_file in sorted(events_dir.glob("*.json"), reverse=True)[:limit]:
            try:
                with open(eval_file, "r", encoding="utf-8") as f:
                    data = json.load(f)
                event = self._parse_event(data)
                if skill_name and event.skill_name != skill_name:
                    continue
                if metric_type and event.metric_type != metric_type:
                    continue
                events.append(event)
            except Exception as e:
                logger.warning(f"Failed to load adaptation event {eval_file}: {e}")

        return events

    def _parse_event(self, data: Dict[str, Any]) -> AdaptationEvent:
        """Parse adaptation event from JSON data."""
        threshold_data = data.get("threshold", {})
        metric_type = MetricType(threshold_data.get("metric_type", "custom"))

        threshold = AdaptationThreshold(
            metric_type=metric_type,
            operator=threshold_data.get("operator", "lt"),
            value=threshold_data.get("value", 0.0),
            window_size=threshold_data.get("window_size", 10),
            min_samples=threshold_data.get("min_samples", 5),
            action=AdaptationAction(threshold_data.get("action", "warn")),
            cooldown_seconds=threshold_data.get("cooldown_seconds", 300),
        )

        return AdaptationEvent(
            timestamp=data.get("timestamp", ""),
            skill_name=data.get("skill_name", ""),
            metric_type=metric_type,
            threshold=threshold,
            current_value=data.get("current_value", 0.0),
            avg_value=data.get("avg_value", 0.0),
            action_taken=AdaptationAction(data.get("action_taken", "warn")),
            details=data.get("details", {}),
        )

    def add_threshold(self, threshold: AdaptationThreshold) -> None:
        """Add a new threshold configuration."""
        self.thresholds.append(threshold)

    def remove_threshold(self, metric_type: MetricType) -> None:
        """Remove all thresholds for a specific metric type."""
        self.thresholds = [
            t for t in self.thresholds
            if t.metric_type != metric_type
        ]

    def update_threshold(
        self,
        metric_type: MetricType,
        **kwargs,
    ) -> None:
        """Update threshold configuration for a metric type."""
        for threshold in self.thresholds:
            if threshold.metric_type == metric_type:
                for key, value in kwargs.items():
                    if hasattr(threshold, key):
                        setattr(threshold, key, value)

    def get_thresholds(self) -> List[AdaptationThreshold]:
        """Get current threshold configurations."""
        return list(self.thresholds)

    def is_in_cooldown(self, skill_name: str, metric_type: MetricType) -> bool:
        """Check if a skill/metric combination is in cooldown period."""
        key = f"{skill_name}:{metric_type.value}"
        last_trigger = self._cooldowns.get(key)
        if not last_trigger:
            return False

        # Find the threshold for this metric type
        for threshold in self.thresholds:
            if threshold.metric_type == metric_type:
                elapsed = (datetime.now() - last_trigger).total_seconds()
                return elapsed < threshold.cooldown_seconds

        return False

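(Illustrative aside, not part of the committed file.) A minimal sketch of driving the hook with a custom threshold; the path, identifiers, and values are assumptions for illustration.

from pathlib import Path

from backend.agents.base.evaluation_hook import MetricType
from backend.agents.base.skill_adaptation_hook import (
    AdaptationAction,
    AdaptationThreshold,
    SkillAdaptationHook,
)

hook = SkillAdaptationHook(
    storage_dir=Path("./data/evaluations"),
    run_id="run-001",
    agent_id="agent-a",
    thresholds=[AdaptationThreshold(
        metric_type=MetricType.HIT_RATE,
        operator="lt",
        value=0.55,
        min_samples=1,  # trigger immediately, no warm-up window
        action=AdaptationAction.WARN,
    )],
)

# Each observed value is checked; an AdaptationEvent is returned (and
# persisted) only when a threshold is breached with enough samples and
# outside its cooldown window.
event = hook.check_threshold("momentum_signal", MetricType.HIT_RATE, 0.41)
if event is not None:
    print(event.action_taken.value)  # "warn"
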

class AdaptationManager:
    """Manager for coordinating skill adaptation across multiple agents.

    Provides centralized tracking of adaptation events and skill reloads.
    """

    def __init__(self, storage_dir: Path):
        """Initialize adaptation manager.

        Args:
            storage_dir: Root directory for storing adaptation data
        """
        self.storage_dir = Path(storage_dir)
        self._hooks: Dict[str, SkillAdaptationHook] = {}

    def get_hook(
        self,
        run_id: str,
        agent_id: str,
        thresholds: Optional[List[AdaptationThreshold]] = None,
    ) -> SkillAdaptationHook:
        """Get or create an adaptation hook for an agent.

        Args:
            run_id: Run identifier
            agent_id: Agent identifier
            thresholds: Optional custom thresholds

        Returns:
            SkillAdaptationHook instance
        """
        key = f"{run_id}:{agent_id}"
        if key not in self._hooks:
            self._hooks[key] = SkillAdaptationHook(
                storage_dir=self.storage_dir,
                run_id=run_id,
                agent_id=agent_id,
                thresholds=thresholds,
            )
        return self._hooks[key]

    def get_all_pending_warnings(self) -> List[AdaptationEvent]:
        """Get all pending warnings from all hooks."""
        warnings = []
        for hook in self._hooks.values():
            warnings.extend(hook.get_pending_warnings())
        return warnings

    def get_run_adaptations(self, run_id: str) -> List[AdaptationEvent]:
        """Get all adaptation events for a run."""
        events = []
        for hook in self._hooks.values():
            if hook.run_id == run_id:
                events.extend(hook.get_recent_events())
        return events


# Global manager instance
_adaptation_manager: Optional[AdaptationManager] = None


def get_adaptation_manager(storage_dir: Optional[Path] = None) -> AdaptationManager:
    """Get global adaptation manager instance.

    Args:
        storage_dir: Optional storage directory (required on first call)

    Returns:
        AdaptationManager instance
    """
    global _adaptation_manager
    if _adaptation_manager is None:
        if storage_dir is None:
            raise ValueError("storage_dir required on first initialization")
        _adaptation_manager = AdaptationManager(storage_dir)
    return _adaptation_manager


__all__ = [
    "AdaptationAction",
    "AdaptationThreshold",
    "AdaptationEvent",
    "SkillAdaptationHook",
    "AdaptationManager",
    "get_adaptation_manager",
]
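(Illustrative aside, not part of the committed file.) Finally, a sketch of the process-wide singleton usage; as above, the storage path and identifiers are assumptions for illustration.

from pathlib import Path

from backend.agents.base.skill_adaptation_hook import get_adaptation_manager

# The first call must supply storage_dir; later calls reuse the same instance.
manager = get_adaptation_manager(Path("./data/evaluations"))

# One hook per (run, agent) pair, created lazily and cached.
hook = manager.get_hook(run_id="run-001", agent_id="agent-a")

# Surface every WARN/BOTH event that still needs human review.
for event in manager.get_all_pending_warnings():
    print(event.skill_name, event.metric_type.value, event.current_value)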