# ============================================
# Project and Experiment Configuration
# ============================================
# Project name for grouping experiments
project: AgentScope-Werewolves
# Unique name for this specific experiment run
name: Werewolves-7Player-GRPO
# Root directory for saving checkpoints. Uses the environment variable if set, otherwise defaults to ./checkpoints
checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,./checkpoints}
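# For example (assuming a bash-style shell), the default can be overridden with:
#   export TRINITY_CHECKPOINT_ROOT_DIR=/mnt/shared/checkpoints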

# ============================================
# Algorithm Configuration
# ============================================
algorithm:
  # Algorithm type: multi_step_grpo for multi-step Group Relative Policy Optimization
  algorithm_type: multi_step_grpo

  # KL divergence loss function for regularization during training
  # "low_var_kl": low-variance KL loss suitable for multi-step optimization
  kl_loss_fn: "low_var_kl"
  kl_loss_fn_args:
    # KL coefficient: 0 means no KL penalty (pure reward optimization)
    kl_coef: 0
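    # Note (an assumption about the backend, not taken from this file): "low_var_kl"
    # usually denotes the low-variance k3 estimator,
    # kl ≈ exp(logp_ref - logp) - (logp_ref - logp) - 1; with kl_coef: 0 the penalty
    # is effectively disabled either way.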

  # Advantage function configuration for computing policy gradients
  advantage_fn_args:
    # Small epsilon value for numerical stability in advantage normalization
    epsilon: 1e-6
    # Normalize advantage by episode length (important for variable-length games)
    step_norm: true
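    # Sketch of the usual group-relative advantage (assumed form, not copied from the
    # implementation): A_i ≈ (R_i - mean(R_group)) / (std(R_group) + epsilon), where the
    # group is the repeat_times rollouts of the same task; with step_norm: true the
    # advantage is additionally normalized by the episode's step count.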

  # Number of rollouts per task (group size for GRPO)
  # Higher values provide better gradient estimates but require more compute
  repeat_times: 32
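  # With batch_size: 24 (see the buffer section below), each exploration step plays
  # 24 * 32 = 768 games in total.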

  # Policy loss function arguments (PPO-style clipping)
  policy_loss_fn_args:
    # Lower bound for probability-ratio clipping (prevents overly large policy updates)
    clip_range_low: 0.2
    # Upper bound for probability-ratio clipping
    clip_range_high: 0.28
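    # These bounds feed the standard PPO-style clipped loss (assumed form):
    # L = -min(r * A, clip(r, 1 - 0.2, 1 + 0.28) * A), with r = pi_new / pi_old;
    # the slightly larger upper bound lets well-rewarded actions move a bit further.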

  # Optimizer configuration
  optimizer:
    # Learning rate for policy updates
    lr: 1e-6

# ============================================
# Model Configuration
# ============================================
model:
  # Path to the base model (trainable model for the werewolf players)
  # Uses the environment variable if set, otherwise defaults to Qwen2.5-7B-Instruct
  model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen2.5-7B-Instruct}
  # Maximum number of tokens the model can generate per response
  max_response_tokens: 4096
  # Total context length the model can handle (prompt + response)
  max_model_len: 25600
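  # Rough budget (arithmetic only): 25600 total - 4096 response tokens leaves about
  # 21504 tokens for the prompt and accumulated game history.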

# ============================================
# Cluster Configuration
# ============================================
cluster:
  # Number of nodes in the Ray cluster
  node_num: 4
  # Number of GPUs per node (total GPUs = node_num * gpu_per_node = 32)
  gpu_per_node: 8
  # Ray cluster address: "auto" automatically detects the cluster
  ray_address: auto
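  # Rough GPU split (assuming inference engines and the trainer occupy disjoint GPUs):
  # the rollout model below uses 16 engines * TP 1 = 16 GPUs, the auxiliary model
  # 8 engines * TP 1 = 8 GPUs, leaving 8 of the 32 GPUs for the trainer.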

# ============================================
# Buffer Configuration (Data Pipeline)
# ============================================
buffer:
  # Total training steps (iterations)
  total_steps: 400
  # Batch size for rollout collection (tasks per training step)
  batch_size: 24
  # Batch size for training (experiences per gradient update)
  # In multi-step GRPO, each rollout of a task produces multiple step-level experiences,
  # so experiences are accumulated across rollouts before each gradient update.
  # Hence the train batch size is larger than the rollout batch (2048 > 24 * 32 = 768).
  train_batch_size: 2048
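  # Illustrative only (the step count per game varies): if an average game contributes
  # ~3 trainable steps per rollout, one exploration step yields ~768 * 3 ≈ 2304
  # experiences, enough to fill a 2048-experience training batch.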

  # Explorer input configuration (rollout data source)
  explorer_input:
    taskset:
      # Dataset name
      name: werewolves
      # Storage type: "file" reads from the local file system
      storage_type: file
      # Path to the dataset directory containing train.jsonl
      path: data
      # Dataset split to use
      split: 'train'
      # Rollout generation arguments
      rollout_args:
        # Sampling temperature for exploration (1.0 = full stochasticity)
        temperature: 1.0
        # Maximum tokens per generation
        max_tokens: 4096
      # Workflow-specific arguments
      workflow_args:
        # Which side to train: "werewolf" or "good_guy" (villager, seer, witch)
        # - "werewolf": train the werewolf players (default)
        # - "good_guy": train the villager, seer, and witch players
        trainable_target: werewolf
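        # For example, set `trainable_target: good_guy` here to train the village side instead.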
    # Evaluation datasets (empty for this experiment)
    eval_tasksets: []

  # Trainer input configuration (training data source)
  trainer_input:
    experience_buffer:
      # Name of the experience buffer
      name: werewolves
      # Storage type: "queue" for in-memory communication between explorer and trainer
      storage_type: queue
      # Maximum time (seconds) to wait for data from the explorer before timing out
      max_read_timeout: 7200
      # Replay buffer configuration for experience reuse
      replay_buffer:
        # Enable the replay buffer to reuse past experiences
        enable: true

# ============================================
# Explorer Configuration (Rollout Generation)
# ============================================
explorer:
  # Number of parallel workflow runners per rollout model instance
  # Higher values = more parallel game simulations
  runner_per_model: 16
  # Maximum time (seconds) to wait for a single task to complete
  max_timeout: 3600
  # Number of retries if a task fails or times out (0 = no retry)
  max_retry_times: 0
  # Number of times each task is repeated within a single runner
  # (usually 1; repeat_times is handled at the algorithm level)
  max_repeat_times_per_runner: 1

  # Over-rollout configuration (collect more data than strictly needed)
  over_rollout:
    # Fraction of tasks the explorer may skip waiting for
    # (0.2 = proceed once 80% of batch_size tasks have finished)
    # Improves throughput by not blocking on slow tasks
    ratio: 0.2
    # Minimum wait time (seconds) after reaching the threshold before proceeding
    wait_after_min: 15
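    # Arithmetic example using the values above: with batch_size: 24 and ratio: 0.2,
    # the explorer can proceed once roughly 24 * 0.8 ≈ 20 tasks have finished.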

  # Dynamic timeout configuration (adaptive timeout based on observed task completion time)
  dynamic_timeout:
    # Enable dynamic timeout adjustment
    enable: true
    # Timeout multiplier: timeout = average_task_time * ratio
    ratio: 4
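    # Illustrative example: if games finish in ~600 s on average, the adjusted timeout
    # becomes 600 * 4 = 2400 s.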

  # Rollout model configuration (the trainable model for the werewolf players)
  rollout_model:
    # Number of vLLM engine instances for parallel inference
    engine_num: 16
    # Tensor parallelism size (GPUs per engine instance)
    tensor_parallel_size: 1
    # KV-cache prefix sharing (kept off here; often disabled for training stability)
    enable_prefix_caching: false
    # Enforce eager execution (false = use CUDA graphs for faster inference)
    enforce_eager: false
    # Enable the OpenAI-compatible API interface for vLLM
    enable_openai_api: true
    # Enable conversation history tracking
    enable_history: true
    # Enable automatic tool choice in function calling
    enable_auto_tool_choice: true
    # Tool call parser for structured outputs (hermes format)
    tool_call_parser: hermes
    # Data type for model weights and activations
    dtype: bfloat16
    # Random seed for reproducibility
    seed: 42

  # Auxiliary models configuration (for non-werewolf players: villagers, seer, witch)
  auxiliary_models:
    # Model name identifier
    - name: participant
      # Path to the auxiliary model (a stronger model used as a stable baseline opponent)
      model_path: ${oc.env:TRINITY_AUXILIARY_MODEL_PATH,Qwen/Qwen3-30B-A3B-Instruct-2507}
      # Number of vLLM engines for the auxiliary model (fewer than the rollout model)
      engine_num: 8
      # Tensor parallelism size
      tensor_parallel_size: 1
      # KV-cache prefix sharing (kept off, matching the rollout model)
      enable_prefix_caching: false
      # Enforce eager execution (false = use CUDA graphs)
      enforce_eager: false
      # Enable the OpenAI-compatible API interface
      enable_openai_api: true
      # Enable automatic tool choice
      enable_auto_tool_choice: true
      # Tool call parser
      tool_call_parser: hermes
      # Data type
      dtype: bfloat16
      # Random seed
      seed: 42

# ============================================
# Synchronizer Configuration (Weight Sync)
# ============================================
synchronizer:
  # Synchronization style: "dynamic_by_explorer" = sync when the explorer requests it
  # Alternatives: "fixed" (sync every N steps), "dynamic_by_trainer"
  sync_style: dynamic_by_explorer
  # Synchronization method: "nccl" uses NVIDIA NCCL for fast GPU-to-GPU communication
  # Alternative: "checkpoint" (slower, file-based sync)
  sync_method: 'nccl'
  # Sync weights every N training steps
  sync_interval: 1
  # Timeout (seconds) for weight synchronization before failing
  sync_timeout: 72000

# ============================================
# Trainer Configuration (Model Training)
# ============================================
trainer:
  # Save a checkpoint every N training steps (0 = only save at the end)
  save_interval: 100
  # Gradient clipping threshold to prevent exploding gradients
  grad_clip: 1.0
  # Use dynamic batch size to maximize GPU memory utilization
  use_dynamic_bsz: true
  # Maximum token length per GPU for memory management
  # Higher = more efficient but requires more VRAM
  max_token_len_per_gpu: 16384
  # Ulysses sequence parallelism size for handling long sequences
  # 2 = split each sequence across 2 GPUs
  ulysses_sequence_parallel_size: 2
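  # Rough capacity check (an assumption about how the per-GPU budget combines with
  # sequence parallelism): 2 GPUs * 16384 tokens = 32768 tokens per sequence group,
  # which covers max_model_len: 25600.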

# ============================================
# Monitor Configuration (Logging & Tracking)
# ============================================
monitor:
  # Monitoring/logging backend: "wandb" for Weights & Biases
  # Alternatives: "tensorboard"
  monitor_type: wandb