Files
evotraders/backend/agents/base/skill_adaptation_hook.py
cillin 45c3996434 refactor(cleanup): remove legacy agent classes and complete EvoAgent migration
Remove deprecated AnalystAgent, PMAgent, and RiskAgent classes.
All agent creation now goes through UnifiedAgentFactory creating EvoAgent instances.

- Delete backend/agents/analyst.py (169 lines)
- Delete backend/agents/portfolio_manager.py (420 lines)
- Delete backend/agents/risk_manager.py (139 lines)
- Update all imports to use EvoAgent exclusively
- Clean up unused imports across 25 files
- Update tests to work with simplified agent structure

Constraint: EvoAgent is now the single source of truth for all agent roles
Constraint: UnifiedAgentFactory handles runtime agent creation
Rejected: Keep legacy aliases | creates maintenance burden
Confidence: high
Scope-risk: moderate (affects agent instantiation paths)
Directive: All new agent features must be added to EvoAgent, not legacy classes
Not-tested: Kubernetes sandbox executor (marked with TODO)
2026-04-02 10:51:14 +08:00

489 lines
16 KiB
Python

# -*- coding: utf-8 -*-
"""Skill adaptation hook for automatic evaluation-to-iteration闭环.
Monitors evaluation metrics against configurable thresholds and triggers
automatic skill reload or logs warnings when thresholds are breached.
"""
from __future__ import annotations
import json
import logging
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional
from .evaluation_hook import (
EvaluationCollector,
MetricType,
)
logger = logging.getLogger(__name__)
class AdaptationAction(Enum):
"""Actions to take when threshold is breached."""
RELOAD = "reload" # 自动重新加载技能
WARN = "warn" # 记录警告供人工审核
BOTH = "both" # 同时执行重载和警告
NONE = "none" # 不做任何操作
@dataclass
class AdaptationThreshold:
"""Threshold configuration for a metric."""
metric_type: MetricType
operator: str = "lt" # lt (less than), gt (greater than), lte, gte, eq
value: float = 0.0
window_size: int = 10 # 移动窗口大小,用于计算滑动平均
min_samples: int = 5 # 最少样本数才触发检查
action: AdaptationAction = AdaptationAction.WARN
cooldown_seconds: int = 300 # 触发后的冷却时间
def evaluate(self, current_value: float) -> bool:
"""Evaluate if threshold is breached."""
ops = {
"lt": lambda x, y: x < y,
"lte": lambda x, y: x <= y,
"gt": lambda x, y: x > y,
"gte": lambda x, y: x >= y,
"eq": lambda x, y: x == y,
}
op_func = ops.get(self.operator)
if op_func is None:
logger.warning(f"Unknown operator: {self.operator}")
return False
return op_func(current_value, self.value)
def to_dict(self) -> Dict[str, Any]:
return {
"metric_type": self.metric_type.value,
"operator": self.operator,
"value": self.value,
"window_size": self.window_size,
"min_samples": self.min_samples,
"action": self.action.value,
"cooldown_seconds": self.cooldown_seconds,
}
@dataclass
class AdaptationEvent:
"""Record of an adaptation trigger event."""
timestamp: str
skill_name: str
metric_type: MetricType
threshold: AdaptationThreshold
current_value: float
avg_value: float
action_taken: AdaptationAction
details: Dict[str, Any] = field(default_factory=dict)
def to_dict(self) -> Dict[str, Any]:
return {
"timestamp": self.timestamp,
"skill_name": self.skill_name,
"metric_type": self.metric_type.value,
"threshold": self.threshold.to_dict(),
"current_value": self.current_value,
"avg_value": self.avg_value,
"action_taken": self.action_taken.value,
"details": self.details,
}
class SkillAdaptationHook:
"""Hook for monitoring evaluation metrics and triggering skill adaptation.
This hook wraps EvaluationHook to add threshold-based adaptation logic.
When metrics breach configured thresholds, it can:
- Automatically reload skills via SkillsManager
- Log warnings for human review
- Both
"""
# Default thresholds for common metrics
DEFAULT_THRESHOLDS: List[AdaptationThreshold] = [
AdaptationThreshold(
metric_type=MetricType.HIT_RATE,
operator="lt",
value=0.5,
action=AdaptationAction.WARN,
cooldown_seconds=600,
),
AdaptationThreshold(
metric_type=MetricType.RISK_VIOLATION,
operator="gt",
value=0.1,
action=AdaptationAction.WARN,
cooldown_seconds=300,
),
AdaptationThreshold(
metric_type=MetricType.DECISION_LATENCY,
operator="gt",
value=5000, # 5 seconds
action=AdaptationAction.WARN,
cooldown_seconds=300,
),
]
def __init__(
self,
storage_dir: Path,
run_id: str,
agent_id: str,
thresholds: Optional[List[AdaptationThreshold]] = None,
collector: Optional[EvaluationCollector] = None,
):
"""Initialize skill adaptation hook.
Args:
storage_dir: Directory to store adaptation events
run_id: Current run identifier
agent_id: Current agent identifier
thresholds: Custom threshold configurations (uses defaults if None)
collector: Optional EvaluationCollector for historical data
"""
self.storage_dir = Path(storage_dir)
self.run_id = run_id
self.agent_id = agent_id
self.thresholds = thresholds or self.DEFAULT_THRESHOLDS
self.collector = collector or EvaluationCollector(storage_dir)
# Track cooldowns to prevent rapid re-triggering
self._cooldowns: Dict[str, datetime] = {}
# Store recent metrics in memory for quick access
self._recent_metrics: Dict[str, List[float]] = {}
# Pending adaptation events
self._pending_events: List[AdaptationEvent] = []
def check_threshold(
self,
skill_name: str,
metric_type: MetricType,
current_value: float,
) -> Optional[AdaptationEvent]:
"""Check if a metric breaches any threshold.
Args:
skill_name: Name of the skill
metric_type: Type of metric
current_value: Current metric value
Returns:
AdaptationEvent if threshold breached, None otherwise
"""
# Find applicable thresholds
applicable_thresholds = [
t for t in self.thresholds
if t.metric_type == metric_type
]
if not applicable_thresholds:
return None
# Check cooldown
cooldown_key = f"{skill_name}:{metric_type.value}"
now = datetime.now()
last_trigger = self._cooldowns.get(cooldown_key)
# Store current value first for avg calculation
self._store_metric(cooldown_key, current_value)
for threshold in applicable_thresholds:
if last_trigger:
elapsed = (now - last_trigger).total_seconds()
if elapsed < threshold.cooldown_seconds:
continue
# Evaluate threshold
if threshold.evaluate(current_value):
# Calculate moving average
avg_value = self._calculate_avg(skill_name, metric_type, current_value)
# Check minimum samples (allow immediate trigger if min_samples <= 1)
sample_count = len(self._recent_metrics.get(cooldown_key, []))
if threshold.min_samples > 1 and sample_count < threshold.min_samples:
# Not enough samples yet
continue
# Trigger adaptation
event = AdaptationEvent(
timestamp=now.isoformat(),
skill_name=skill_name,
metric_type=metric_type,
threshold=threshold,
current_value=current_value,
avg_value=avg_value,
action_taken=threshold.action,
details={
"run_id": self.run_id,
"agent_id": self.agent_id,
},
)
# Update cooldown
self._cooldowns[cooldown_key] = now
# Persist event
self._persist_event(event)
logger.info(
f"Threshold breached for {skill_name}.{metric_type.value}: "
f"current={current_value}, avg={avg_value}, action={threshold.action.value}"
)
return event
return None
def _calculate_avg(
self,
skill_name: str,
metric_type: MetricType,
current_value: float,
) -> float:
"""Calculate moving average for a metric."""
key = f"{skill_name}:{metric_type.value}"
values = self._recent_metrics.get(key, [])
if not values:
return current_value
return sum(values) / len(values)
def _store_metric(self, key: str, value: float) -> None:
"""Store metric value with sliding window."""
if key not in self._recent_metrics:
self._recent_metrics[key] = []
self._recent_metrics[key].append(value)
# Keep only last 100 values
if len(self._recent_metrics[key]) > 100:
self._recent_metrics[key] = self._recent_metrics[key][-100:]
def _persist_event(self, event: AdaptationEvent) -> None:
"""Persist adaptation event to storage."""
run_dir = self.storage_dir / self.run_id / "adaptations"
run_dir.mkdir(parents=True, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
filename = f"{event.skill_name}_{event.metric_type.value}_{timestamp}.json"
filepath = run_dir / filename
try:
with open(filepath, "w", encoding="utf-8") as f:
json.dump(event.to_dict(), f, ensure_ascii=False, indent=2)
logger.debug(f"Persisted adaptation event to: {filepath}")
except Exception as e:
logger.error(f"Failed to persist adaptation event: {e}")
# Also add to pending list
self._pending_events.append(event)
def get_pending_warnings(self) -> List[AdaptationEvent]:
"""Get all pending warning events that need human review."""
return [
e for e in self._pending_events
if e.action_taken in (AdaptationAction.WARN, AdaptationAction.BOTH)
]
def clear_pending_warnings(self) -> None:
"""Clear pending warnings after they have been reviewed."""
self._pending_events = [
e for e in self._pending_events
if e.action_taken == AdaptationAction.RELOAD
]
def get_recent_events(
self,
skill_name: Optional[str] = None,
metric_type: Optional[MetricType] = None,
limit: int = 50,
) -> List[AdaptationEvent]:
"""Get recent adaptation events.
Args:
skill_name: Optional filter by skill name
metric_type: Optional filter by metric type
limit: Maximum number of events to return
Returns:
List of recent adaptation events
"""
events_dir = self.storage_dir / self.run_id / "adaptations"
if not events_dir.exists():
return []
events = []
for eval_file in sorted(events_dir.glob("*.json"), reverse=True)[:limit]:
try:
with open(eval_file, "r", encoding="utf-8") as f:
data = json.load(f)
event = self._parse_event(data)
if skill_name and event.skill_name != skill_name:
continue
if metric_type and event.metric_type != metric_type:
continue
events.append(event)
except Exception as e:
logger.warning(f"Failed to load adaptation event {eval_file}: {e}")
return events
def _parse_event(self, data: Dict[str, Any]) -> AdaptationEvent:
"""Parse adaptation event from JSON data."""
threshold_data = data.get("threshold", {})
metric_type = MetricType(threshold_data.get("metric_type", "custom"))
threshold = AdaptationThreshold(
metric_type=metric_type,
operator=threshold_data.get("operator", "lt"),
value=threshold_data.get("value", 0.0),
window_size=threshold_data.get("window_size", 10),
min_samples=threshold_data.get("min_samples", 5),
action=AdaptationAction(threshold_data.get("action", "warn")),
cooldown_seconds=threshold_data.get("cooldown_seconds", 300),
)
return AdaptationEvent(
timestamp=data.get("timestamp", ""),
skill_name=data.get("skill_name", ""),
metric_type=metric_type,
threshold=threshold,
current_value=data.get("current_value", 0.0),
avg_value=data.get("avg_value", 0.0),
action_taken=AdaptationAction(data.get("action_taken", "warn")),
details=data.get("details", {}),
)
def add_threshold(self, threshold: AdaptationThreshold) -> None:
"""Add a new threshold configuration."""
self.thresholds.append(threshold)
def remove_threshold(self, metric_type: MetricType) -> None:
"""Remove all thresholds for a specific metric type."""
self.thresholds = [
t for t in self.thresholds
if t.metric_type != metric_type
]
def update_threshold(
self,
metric_type: MetricType,
**kwargs,
) -> None:
"""Update threshold configuration for a metric type."""
for threshold in self.thresholds:
if threshold.metric_type == metric_type:
for key, value in kwargs.items():
if hasattr(threshold, key):
setattr(threshold, key, value)
def get_thresholds(self) -> List[AdaptationThreshold]:
"""Get current threshold configurations."""
return list(self.thresholds)
def is_in_cooldown(self, skill_name: str, metric_type: MetricType) -> bool:
"""Check if a skill/metric combination is in cooldown period."""
key = f"{skill_name}:{metric_type.value}"
last_trigger = self._cooldowns.get(key)
if not last_trigger:
return False
# Find the threshold for this metric type
for threshold in self.thresholds:
if threshold.metric_type == metric_type:
elapsed = (datetime.now() - last_trigger).total_seconds()
return elapsed < threshold.cooldown_seconds
return False
class AdaptationManager:
"""Manager for coordinating skill adaptation across multiple agents.
Provides centralized tracking of adaptation events and skill reloads.
"""
def __init__(self, storage_dir: Path):
"""Initialize adaptation manager.
Args:
storage_dir: Root directory for storing adaptation data
"""
self.storage_dir = Path(storage_dir)
self._hooks: Dict[str, SkillAdaptationHook] = {}
def get_hook(
self,
run_id: str,
agent_id: str,
thresholds: Optional[List[AdaptationThreshold]] = None,
) -> SkillAdaptationHook:
"""Get or create an adaptation hook for an agent.
Args:
run_id: Run identifier
agent_id: Agent identifier
thresholds: Optional custom thresholds
Returns:
SkillAdaptationHook instance
"""
key = f"{run_id}:{agent_id}"
if key not in self._hooks:
self._hooks[key] = SkillAdaptationHook(
storage_dir=self.storage_dir,
run_id=run_id,
agent_id=agent_id,
thresholds=thresholds,
)
return self._hooks[key]
def get_all_pending_warnings(self) -> List[AdaptationEvent]:
"""Get all pending warnings from all hooks."""
warnings = []
for hook in self._hooks.values():
warnings.extend(hook.get_pending_warnings())
return warnings
def get_run_adaptations(self, run_id: str) -> List[AdaptationEvent]:
"""Get all adaptation events for a run."""
events = []
for hook in self._hooks.values():
if hook.run_id == run_id:
events.extend(hook.get_recent_events())
return events
# Global manager instance
_adaptation_manager: Optional[AdaptationManager] = None
def get_adaptation_manager(storage_dir: Optional[Path] = None) -> AdaptationManager:
"""Get global adaptation manager instance.
Args:
storage_dir: Optional storage directory (required on first call)
Returns:
AdaptationManager instance
"""
global _adaptation_manager
if _adaptation_manager is None:
if storage_dir is None:
raise ValueError("storage_dir required on first initialization")
_adaptation_manager = AdaptationManager(storage_dir)
return _adaptation_manager
__all__ = [
"AdaptationAction",
"AdaptationThreshold",
"AdaptationEvent",
"SkillAdaptationHook",
"AdaptationManager",
"get_adaptation_manager",
]