373 lines
7.8 KiB
ReStructuredText
373 lines
7.8 KiB
ReStructuredText
Monitoring & Alerts
===================
|
|
|
|
OpenClaw provides comprehensive monitoring and alerting capabilities to track system health and trading performance.
|
|
|
|
Overview
|
|
--------
|
|
|
|
Monitoring Components
|
|
~~~~~~~~~~~~~~~~~~~~~
|
|
|
|
* **Metrics Collection**: Performance and system metrics
|
|
* **Alerting**: Real-time notifications for critical events
|
|
* **Dashboards**: Visual monitoring interface
|
|
* **Logging**: Structured logging for debugging
|
|
* **Health Checks**: System availability monitoring
|
|
|
|
Quick Start
|
|
-----------
|
|
|
|
Basic Monitoring
|
|
~~~~~~~~~~~~~~~~
|
|
|
|
.. code-block:: python
|
|
|
|
from openclaw.monitoring.metrics import MetricsCollector
|
|
|
|
# Create collector
|
|
collector = MetricsCollector()
|
|
|
|
# Record metric
|
|
collector.record("trade.pnl", value=150.0, tags={
|
|
"symbol": "AAPL",
|
|
"strategy": "trend_following"
|
|
})
|
|
|
|
# Get statistics
|
|
stats = collector.get_stats("trade.pnl")
|
|
print(f"Avg PnL: {stats.mean:.2f}")
|
|
|
|
Setting Up Alerts
|
|
~~~~~~~~~~~~~~~~~
|
|
|
|
.. code-block:: python
|
|
|
|
from openclaw.monitoring.alerts import AlertManager, AlertRule
|
|
|
|
# Create alert manager
|
|
alerts = AlertManager()
|
|
|
|
# Define alert rule
|
|
rule = AlertRule(
|
|
name="high_drawdown",
|
|
condition="drawdown > 0.10",
|
|
severity="critical",
|
|
channels=["email", "slack"]
|
|
)
|
|
|
|
# Add rule
|
|
alerts.add_rule(rule)
|
|
|
|
# Check conditions
|
|
alerts.check_all(agent_state)
|
|
|
|
Metrics Collection
|
|
------------------
|
|
|
|
Built-in Metrics
|
|
~~~~~~~~~~~~~~~~
|
|
|
|
Trading Metrics:
|
|
|
|
* Trade count and frequency
|
|
* Win/loss ratio
|
|
* Average profit/loss
|
|
* Sharpe ratio
|
|
* Maximum drawdown
|
|
* Position sizes
|
|
|
|
System Metrics:
|
|
|
|
* API latency
|
|
* Error rates
|
|
* Decision costs
|
|
* Agent survival rates
|
|
* Workflow execution time
|
|
|
|
Custom Metrics
|
|
~~~~~~~~~~~~~~
|
|
|
|
.. code-block:: python
|
|
|
|
from openclaw.monitoring.metrics import Metric
|
|
|
|
# Create custom metric
|
|
custom_metric = Metric(
|
|
name="custom_factor.performance",
|
|
type="gauge",
|
|
description="Performance of custom trading factor",
|
|
unit="percent"
|
|
)
|
|
|
|
# Record value
|
|
custom_metric.record(15.5, tags={
|
|
"factor_name": "my_factor",
|
|
"symbol": "AAPL"
|
|
})
|
|
|
|
Metric Types
|
|
~~~~~~~~~~~~
|
|
|
|
**Counter**: Cumulative values (e.g., total trades)
|
|
|
|
.. code-block:: python
|
|
|
|
collector.increment("trades.total", tags={"symbol": "AAPL"})
|
|
|
|
**Gauge**: Point-in-time values (e.g., current balance)
|
|
|
|
.. code-block:: python
|
|
|
|
collector.gauge("agent.balance", value=1500.0, tags={"agent_id": "agent_001"})
|
|
|
|
**Histogram**: Distribution of values (e.g., trade PnL)
|
|
|
|
.. code-block:: python
|
|
|
|
collector.histogram("trade.pnl", value=100.0)
|
|
|
|
**Timer**: Duration measurements (e.g., analysis time)
|
|
|
|
.. code-block:: python

    with collector.timer("analysis.duration"):
        result = agent.analyze("AAPL")
|
|
|
|
Alerting System
|
|
---------------
|
|
|
|
Alert Rules
|
|
~~~~~~~~~~~
|
|
|
|
.. code-block:: python
|
|
|
|
from openclaw.monitoring.alerts import AlertRule, AlertCondition
|
|
|
|
# Create rule with multiple conditions
|
|
rule = AlertRule(
|
|
name="agent_distress",
|
|
description="Agent is in critical condition",
|
|
conditions=[
|
|
AlertCondition(
|
|
metric="agent.balance",
|
|
operator="less_than",
|
|
threshold=300.0
|
|
),
|
|
AlertCondition(
|
|
metric="agent.drawdown",
|
|
operator="greater_than",
|
|
threshold=0.70
|
|
)
|
|
],
|
|
severity="critical",
|
|
cooldown_minutes=60
|
|
)
|
|
|
|
alerts.add_rule(rule)
|
|
|
|
Alert Channels
|
|
~~~~~~~~~~~~~~
|
|
|
|
Email Alerts:
|
|
|
|
.. code-block:: python
|
|
|
|
from openclaw.monitoring.channels import EmailChannel
|
|
|
|
email = EmailChannel(
|
|
smtp_server="smtp.gmail.com",
|
|
smtp_port=587,
|
|
username="alerts@example.com",
|
|
password="app_password"
|
|
)
|
|
|
|
alerts.register_channel("email", email)
|
|
|
|
Slack Alerts:
|
|
|
|
.. code-block:: python
|
|
|
|
from openclaw.monitoring.channels import SlackChannel
|
|
|
|
slack = SlackChannel(
|
|
webhook_url="https://hooks.slack.com/services/YOUR/WEBHOOK/URL"
|
|
)
|
|
|
|
alerts.register_channel("slack", slack)
|
|
|
|
Webhook Alerts:
|
|
|
|
.. code-block:: python
|
|
|
|
from openclaw.monitoring.channels import WebhookChannel
|
|
|
|
webhook = WebhookChannel(
|
|
url="https://api.example.com/alerts",
|
|
headers={"Authorization": "Bearer token123"}
|
|
)
|
|
|
|
alerts.register_channel("webhook", webhook)
|
|
|
|
Alert Severity Levels
|
|
~~~~~~~~~~~~~~~~~~~~~
|
|
|
|
* **INFO**: General information, no action required
|
|
* **WARNING**: Attention needed soon
|
|
* **CRITICAL**: Immediate action required
|
|
* **EMERGENCY**: System-stopping event
|
|
|
|
Dashboard
|
|
---------
|
|
|
|
Web Dashboard
|
|
~~~~~~~~~~~~~
|
|
|
|
Start the monitoring dashboard:
|
|
|
|
.. code-block:: bash
|
|
|
|
openclaw dashboard --port 8080
|
|
|
|
Access the dashboard at http://localhost:8080
|
|
|
|
Dashboard Components:
|
|
|
|
* Real-time P&L chart
|
|
* Agent status overview
|
|
* System health metrics
|
|
* Recent alerts
|
|
* Active trades
|
|
* Performance statistics
|
|
|
|
Custom Dashboards
|
|
~~~~~~~~~~~~~~~~~
|
|
|
|
.. code-block:: python
|
|
|
|
from openclaw.dashboard.builder import DashboardBuilder
|
|
|
|
builder = DashboardBuilder()
|
|
|
|
# Add widgets
|
|
builder.add_line_chart(
|
|
title="Portfolio Value",
|
|
metric="portfolio.value",
|
|
time_range="1d"
|
|
)
|
|
|
|
builder.add_gauge(
|
|
title="Win Rate",
|
|
metric="performance.win_rate",
|
|
min_value=0,
|
|
max_value=1
|
|
)
|
|
|
|
builder.add_table(
|
|
title="Active Agents",
|
|
query="SELECT * FROM agents WHERE status='active'"
|
|
)
|
|
|
|
# Build dashboard
|
|
dashboard = builder.build()
|
|
dashboard.serve(port=8080)
|
|
|
|
Logging
|
|
-------
|
|
|
|
Structured Logging
|
|
~~~~~~~~~~~~~~~~~~
|
|
|
|
.. code-block:: python
|
|
|
|
from openclaw.utils.logging import get_logger
|
|
|
|
logger = get_logger("my_module")
|
|
|
|
# Different log levels
|
|
logger.debug("Debug information")
|
|
logger.info("General information")
|
|
logger.warning("Warning message")
|
|
logger.error("Error occurred")
|
|
logger.critical("Critical failure")
|
|
|
|
# Structured logging
|
|
logger.info("Trade executed", extra={
|
|
"symbol": "AAPL",
|
|
"side": "buy",
|
|
"quantity": 100,
|
|
"price": 150.0
|
|
})
|
|
|
|
Log Configuration
|
|
~~~~~~~~~~~~~~~~~
|
|
|
|
.. code-block:: yaml

    # config/logging.yaml
    logging:
      level: INFO
      format: json
      outputs:
        - type: file
          path: /var/log/openclaw/trading.log
          rotation: "1 day"
          retention: "30 days"
        - type: console
          format: text
|
|
|
|
Health Checks
|
|
-------------
|
|
|
|
System Health
|
|
~~~~~~~~~~~~~
|
|
|
|
.. code-block:: python

    from openclaw.monitoring.health import HealthChecker

    health = HealthChecker()

    # Register checks
    health.add_check("database", check_database_connection)
    health.add_check("exchange_api", check_exchange_api)
    health.add_check("data_feed", check_data_feed)

    # Run checks
    status = health.check_all()

    if status.healthy:
        print("System healthy")
    else:
        for check, result in status.checks.items():
            if not result.healthy:
                print(f"{check}: FAILED - {result.message}")
|
|
|
|
Agent Health
|
|
~~~~~~~~~~~~
|
|
|
|
.. code-block:: python

    from openclaw.monitoring.health import AgentHealthMonitor

    monitor = AgentHealthMonitor()

    # Check agent health
    for agent in agents:
        health = monitor.check_agent(agent)

        if health.status == "critical":
            alerts.send(f"Agent {agent.agent_id} is critical")
        elif health.status == "struggling":
            logger.warning(f"Agent {agent.agent_id} is struggling")
|
|
|
|
Monitoring Best Practices
|
|
-------------------------
|
|
|
|
1. **Monitor key metrics**: Focus on P&L, drawdown, and survival rates
|
|
2. **Set appropriate thresholds**: Avoid alert fatigue
|
|
3. **Use cooldown periods**: Prevent alert spam
|
|
4. **Regular health checks**: Automated system verification
|
|
5. **Centralized logging**: Aggregate logs for analysis
|
|
6. **Retention policies**: Manage data storage costs
|