feat: Refactor services architecture and update project structure

- Remove Docker-based microservices (docker-compose.yml, Makefile, Dockerfiles)
- Update start-dev.sh to use backend.app:app entry point
- Add shared schema and client modules for service communication
- Add team coordination modules (messenger, registry, task_delegator, coordinator)
- Add evaluation hooks and skill adaptation hooks
- Add skill template and gateway server
- Update frontend WebSocket URL configuration
- Add explain components for insider and technical analysis

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-20 00:57:09 +08:00
parent 4b5ac86b83
commit 5b925fbe02
27 changed files with 4213 additions and 1 deletions
--- a/backend/agents/base/evaluation_hook.py
+++ b/backend/agents/base/evaluation_hook.py
@@ -0,0 +1,452 @@
+# -*- coding: utf-8 -*-
+"""Evaluation hooks system for skills.
+
+Provides evaluation metric collection and storage for skill performance tracking.
+Based on the evaluation hooks design in SKILL_TEMPLATE.md.
+"""
+from __future__ import annotations
+
+import json
+import logging
+from dataclasses import dataclass, field, asdict
+from datetime import datetime
+from enum import Enum
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set
+
+# Module-level logger, named after this module via __name__.
+logger = logging.getLogger(__name__)
+
+
+class MetricType(Enum):
+    """Categories of evaluation metric that can be recorded for a skill.
+
+    Each member's value is the string stored in serialized evaluation results.
+    """
+
+    # Signal hit rate
+    HIT_RATE = "hit_rate"
+    # Risk-control violation rate
+    RISK_VIOLATION = "risk_violation"
+    # Position deviation rate
+    POSITION_DEVIATION = "position_deviation"
+    # P&L attribution consistency
+    PnL_ATTRIBUTION = "pnl_attribution"
+    # Signal consistency
+    SIGNAL_CONSISTENCY = "signal_consistency"
+    # Decision latency
+    DECISION_LATENCY = "decision_latency"
+    # Tool usage rate
+    TOOL_USAGE = "tool_usage"
+    # User-defined metric
+    CUSTOM = "custom"
+
+
+@dataclass
+class EvaluationMetric:
+    """One named measurement captured while evaluating a skill.
+
+    ``timestamp`` defaults to ``datetime.now().isoformat()`` taken when the
+    instance is created, and ``metadata`` defaults to a fresh empty dict.
+    """
+
+    name: str
+    metric_type: MetricType
+    value: float
+    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return a plain-dict view of the metric.
+
+        The metric type is reduced to its string ``value``; ``metadata`` is
+        returned by reference, not copied.
+        """
+        serialized: Dict[str, Any] = dict(
+            name=self.name,
+            metric_type=self.metric_type.value,
+            value=self.value,
+        )
+        serialized["timestamp"] = self.timestamp
+        serialized["metadata"] = self.metadata
+        return serialized
+
+
+@dataclass
+class EvaluationResult:
+    """Everything recorded about one evaluated skill execution.
+
+    Only ``skill_name``, ``run_id`` and ``agent_id`` are required. The
+    containers default to fresh empty objects, and ``completed_at`` defaults
+    to ``datetime.now().isoformat()`` taken when the instance is created.
+    """
+
+    skill_name: str
+    run_id: str
+    agent_id: str
+    metrics: List[EvaluationMetric] = field(default_factory=list)
+    inputs: Dict[str, Any] = field(default_factory=dict)
+    outputs: Dict[str, Any] = field(default_factory=dict)
+    decision: Optional[str] = None
+    success: bool = True
+    error_message: Optional[str] = None
+    started_at: Optional[str] = None
+    completed_at: Optional[str] = field(default_factory=lambda: datetime.now().isoformat())
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return a plain-dict view, with each metric converted via its own ``to_dict``."""
+        metric_dicts = [metric.to_dict() for metric in self.metrics]
+        payload: Dict[str, Any] = {
+            "skill_name": self.skill_name,
+            "run_id": self.run_id,
+            "agent_id": self.agent_id,
+            "metrics": metric_dicts,
+        }
+        # The remaining keys map one-to-one onto attributes of the same name.
+        passthrough = (
+            "inputs",
+            "outputs",
+            "decision",
+            "success",
+            "error_message",
+            "started_at",
+            "completed_at",
+        )
+        for attr in passthrough:
+            payload[attr] = getattr(self, attr)
+        return payload
+
+
+class EvaluationHook:
+    """Collect evaluation data for one skill execution at a time and persist it.
+
+    A session is opened with :meth:`start_evaluation`, filled in through the
+    ``add_*`` / ``record_*`` methods, and closed either with
+    :meth:`complete_evaluation` (which writes a JSON file) or
+    :meth:`cancel_evaluation` (which discards it). A hook instance tracks a
+    single session at a time.
+    """
+
+    def __init__(
+        self,
+        storage_dir: Path,
+        run_id: str,
+        agent_id: str,
+    ):
+        """Initialize evaluation hook.
+
+        Args:
+            storage_dir: Directory to store evaluation results
+            run_id: Current run identifier
+            agent_id: Current agent identifier
+        """
+        self.storage_dir = Path(storage_dir)
+        self.run_id = run_id
+        self.agent_id = agent_id
+        self._current_evaluation: Optional[EvaluationResult] = None
+
+    def start_evaluation(
+        self,
+        skill_name: str,
+        inputs: Dict[str, Any],
+    ) -> None:
+        """Start a new evaluation session.
+
+        Any session that is still active is replaced; a warning is logged so
+        the discarded data does not disappear silently.
+
+        Args:
+            skill_name: Name of the skill being evaluated
+            inputs: Input parameters for the skill
+        """
+        if self._current_evaluation is not None:
+            logger.warning(
+                "Discarding unfinished evaluation for skill: "
+                f"{self._current_evaluation.skill_name}"
+            )
+
+        self._current_evaluation = EvaluationResult(
+            skill_name=skill_name,
+            run_id=self.run_id,
+            agent_id=self.agent_id,
+            inputs=inputs,
+            started_at=datetime.now().isoformat(),
+        )
+        logger.debug(f"Started evaluation for skill: {skill_name}")
+
+    def add_metric(
+        self,
+        name: str,
+        metric_type: MetricType,
+        value: float,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        """Add an evaluation metric to the active session.
+
+        Logs a warning and does nothing when no session is active.
+
+        Args:
+            name: Metric name
+            metric_type: Type of metric
+            value: Metric value
+            metadata: Additional metadata
+        """
+        if self._current_evaluation is None:
+            logger.warning("No active evaluation session, ignoring metric")
+            return
+
+        metric = EvaluationMetric(
+            name=name,
+            metric_type=metric_type,
+            value=value,
+            metadata=metadata or {},
+        )
+        self._current_evaluation.metrics.append(metric)
+        logger.debug(f"Added metric: {name} = {value}")
+
+    def add_metrics(self, metrics: List[EvaluationMetric]) -> None:
+        """Add multiple evaluation metrics to the active session at once.
+
+        Logs a warning and does nothing when no session is active.
+
+        Args:
+            metrics: List of metrics to add
+        """
+        if self._current_evaluation is None:
+            logger.warning("No active evaluation session, ignoring metrics")
+            return
+
+        self._current_evaluation.metrics.extend(metrics)
+        logger.debug(f"Added {len(metrics)} metrics")
+
+    def record_outputs(self, outputs: Dict[str, Any]) -> None:
+        """Record skill outputs, replacing any previously recorded outputs.
+
+        Logs a warning and does nothing when no session is active.
+
+        Args:
+            outputs: Output from skill execution
+        """
+        if self._current_evaluation is None:
+            logger.warning("No active evaluation session, ignoring outputs")
+            return
+
+        self._current_evaluation.outputs = outputs
+
+    def record_decision(self, decision: str) -> None:
+        """Record the final decision, replacing any previously recorded one.
+
+        Logs a warning and does nothing when no session is active.
+
+        Args:
+            decision: Final decision made by the skill
+        """
+        if self._current_evaluation is None:
+            logger.warning("No active evaluation session, ignoring decision")
+            return
+
+        self._current_evaluation.decision = decision
+
+    def complete_evaluation(
+        self,
+        success: bool = True,
+        error_message: Optional[str] = None,
+    ) -> Optional[EvaluationResult]:
+        """Complete the evaluation session and persist results.
+
+        Args:
+            success: Whether the skill execution was successful
+            error_message: Error message if failed
+
+        Returns:
+            The completed evaluation result, or None if no active evaluation
+        """
+        evaluation = self._current_evaluation
+        if evaluation is None:
+            logger.warning("No active evaluation to complete")
+            return None
+
+        # Close the session before persisting so the hook is left in a clean,
+        # reusable state whatever happens while writing.
+        self._current_evaluation = None
+
+        evaluation.success = success
+        evaluation.error_message = error_message
+        evaluation.completed_at = datetime.now().isoformat()
+
+        # Persist to storage
+        result = self._persist_evaluation(evaluation)
+        logger.debug(f"Completed evaluation for skill: {result.skill_name}")
+
+        return result
+
+    def _persist_evaluation(self, evaluation: EvaluationResult) -> EvaluationResult:
+        """Persist evaluation result to storage (best effort).
+
+        The result is written to
+        ``<storage_dir>/<run_id>/<agent_id>/<skill_name>_<timestamp>.json``.
+        Failures are logged and swallowed so that evaluation bookkeeping does
+        not propagate errors to the caller.
+
+        Args:
+            evaluation: Evaluation result to persist
+
+        Returns:
+            The same evaluation object that was passed in
+        """
+        agent_dir = self.storage_dir / self.run_id / self.agent_id
+
+        # Microsecond-resolution timestamp in the filename
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+        filepath = agent_dir / f"{evaluation.skill_name}_{timestamp}.json"
+
+        try:
+            # Serialize completely in memory first: if inputs/outputs contain a
+            # value json cannot encode, we fail here and never leave a
+            # truncated .json file behind for the collector to choke on.
+            payload = json.dumps(evaluation.to_dict(), ensure_ascii=False, indent=2)
+
+            # parents=True creates the run directory and the agent directory together.
+            agent_dir.mkdir(parents=True, exist_ok=True)
+            with open(filepath, "w", encoding="utf-8") as f:
+                f.write(payload)
+        except Exception as e:
+            logger.error(f"Failed to persist evaluation: {e}")
+        else:
+            logger.info(f"Persisted evaluation to: {filepath}")
+
+        return evaluation
+
+    def cancel_evaluation(self) -> None:
+        """Cancel the current evaluation session without saving."""
+        if self._current_evaluation is not None:
+            logger.debug(f"Cancelled evaluation for: {self._current_evaluation.skill_name}")
+            self._current_evaluation = None
+
+
+class EvaluationCollector:
+    """Read persisted evaluation results back from disk and aggregate them.
+
+    Results are looked up under ``<storage_dir>/<run_id>/<agent_id>/*.json``.
+    Missing directories are treated as "no data" rather than as errors.
+    """
+
+    def __init__(self, storage_dir: Path):
+        """Initialize evaluation collector.
+
+        Args:
+            storage_dir: Root directory containing evaluation results
+        """
+        self.storage_dir = Path(storage_dir)
+
+    def get_run_evaluations(
+        self,
+        run_id: str,
+        agent_id: Optional[str] = None,
+    ) -> List[EvaluationResult]:
+        """Get all evaluations for a run.
+
+        Files that cannot be read or parsed are skipped with a warning.
+
+        Args:
+            run_id: Run identifier
+            agent_id: Optional agent identifier to filter by
+
+        Returns:
+            List of evaluation results (empty if the run directory is missing)
+        """
+        run_dir = self.storage_dir / run_id
+        # is_dir() rather than exists(): a stray file at this path must not
+        # reach iterdir(), which would raise NotADirectoryError.
+        if not run_dir.is_dir():
+            return []
+
+        # Sort so the result order is stable regardless of filesystem
+        # enumeration order.
+        agent_dirs = [run_dir / agent_id] if agent_id else sorted(run_dir.iterdir())
+
+        evaluations: List[EvaluationResult] = []
+        for agent_dir in agent_dirs:
+            if not agent_dir.is_dir():
+                continue
+
+            for eval_file in sorted(agent_dir.glob("*.json")):
+                try:
+                    with open(eval_file, "r", encoding="utf-8") as f:
+                        data = json.load(f)
+                    evaluations.append(self._parse_evaluation(data))
+                except Exception as e:
+                    logger.warning(f"Failed to load evaluation {eval_file}: {e}")
+
+        return evaluations
+
+    def get_skill_metrics(
+        self,
+        skill_name: str,
+        run_ids: Optional[List[str]] = None,
+    ) -> List[EvaluationMetric]:
+        """Get all metrics for a specific skill.
+
+        Args:
+            skill_name: Name of the skill
+            run_ids: Optional list of run IDs to filter by; when None, every
+                subdirectory of the storage directory is treated as a run
+
+        Returns:
+            List of metrics for the skill (empty if nothing has been stored)
+        """
+        if run_ids is None:
+            # Nothing has been persisted yet: behave like get_run_evaluations
+            # does for a missing run instead of raising FileNotFoundError.
+            if not self.storage_dir.is_dir():
+                return []
+            run_ids = sorted(d.name for d in self.storage_dir.iterdir() if d.is_dir())
+
+        metrics: List[EvaluationMetric] = []
+        for run_id in run_ids:
+            for eval_result in self.get_run_evaluations(run_id):
+                if eval_result.skill_name == skill_name:
+                    metrics.extend(eval_result.metrics)
+
+        return metrics
+
+    def calculate_skill_stats(
+        self,
+        skill_name: str,
+        metric_type: MetricType,
+        run_ids: Optional[List[str]] = None,
+    ) -> Dict[str, float]:
+        """Calculate statistics for a specific metric type.
+
+        Args:
+            skill_name: Name of the skill
+            metric_type: Type of metric to calculate
+            run_ids: Optional list of run IDs to filter by
+
+        Returns:
+            Dictionary with count, min, max and avg; only ``{"count": 0}``
+            when no matching metric exists
+        """
+        values = [
+            m.value
+            for m in self.get_skill_metrics(skill_name, run_ids)
+            if m.metric_type == metric_type
+        ]
+
+        if not values:
+            return {"count": 0}
+
+        return {
+            "count": len(values),
+            "min": min(values),
+            "max": max(values),
+            "avg": sum(values) / len(values),
+        }
+
+    def _parse_evaluation(self, data: Dict[str, Any]) -> EvaluationResult:
+        """Parse evaluation data into EvaluationResult.
+
+        Args:
+            data: Raw evaluation data as loaded from a JSON file
+
+        Returns:
+            Parsed EvaluationResult
+
+        Raises:
+            KeyError: If a required key (``skill_name``, ``run_id``,
+                ``agent_id``, or a metric's ``name``/``metric_type``/``value``)
+                is missing
+            ValueError: If a metric's ``metric_type`` is not a known MetricType value
+        """
+        metrics = [
+            EvaluationMetric(
+                name=m["name"],
+                metric_type=MetricType(m["metric_type"]),
+                value=m["value"],
+                timestamp=m.get("timestamp", ""),
+                metadata=m.get("metadata", {}),
+            )
+            for m in data.get("metrics", [])
+        ]
+
+        return EvaluationResult(
+            skill_name=data["skill_name"],
+            run_id=data["run_id"],
+            agent_id=data["agent_id"],
+            metrics=metrics,
+            inputs=data.get("inputs", {}),
+            outputs=data.get("outputs", {}),
+            decision=data.get("decision"),
+            success=data.get("success", True),
+            error_message=data.get("error_message"),
+            started_at=data.get("started_at"),
+            completed_at=data.get("completed_at"),
+        )
+
+
+def parse_evaluation_hooks(skill_dir: Path) -> Dict[str, Any]:
+    """Parse evaluation hooks from SKILL.md.
+
+    Extracts the "## Optional: Evaluation hooks" section from the skill's
+    documentation. The section runs from that heading up to the next "## "
+    heading, or to the end of the file. A metric counts as supported when its
+    identifier appears in the section, case-insensitively, either verbatim
+    (``hit_rate``) or with underscores written as spaces (``hit rate``).
+
+    Args:
+        skill_dir: Skill directory path
+
+    Returns:
+        Dictionary with ``supported_metrics`` (list of metric value strings)
+        and ``section_content`` (the stripped section text), or an empty
+        dictionary when SKILL.md is missing, unreadable, or has no such section
+    """
+    heading = "## Optional: Evaluation hooks"
+
+    skill_md = skill_dir / "SKILL.md"
+    if not skill_md.exists():
+        return {}
+
+    try:
+        content = skill_md.read_text(encoding="utf-8")
+    except (OSError, UnicodeError) as e:
+        logger.warning(f"Failed to parse evaluation hooks: {e}")
+        return {}
+
+    start = content.find(heading)
+    if start == -1:
+        return {}
+
+    # Find the next ## section or end of file
+    next_section = content.find("\n## ", start + 1)
+    eval_section = content[start:] if next_section == -1 else content[start:next_section]
+
+    # Lower-case once instead of once per enum member.
+    section_lower = eval_section.lower()
+    metrics = [
+        metric_type.value
+        for metric_type in MetricType
+        if metric_type.value in section_lower
+        or metric_type.value.replace("_", " ") in section_lower
+    ]
+
+    return {
+        "supported_metrics": metrics,
+        "section_content": eval_section.strip(),
+    }
+
+
+# Names exported by `from <module> import *`; the module's public API.
+__all__ = [
+    "MetricType",
+    "EvaluationMetric",
+    "EvaluationResult",
+    "EvaluationHook",
+    "EvaluationCollector",
+    "parse_evaluation_hooks",
+]