Add examples for werewolf game tuner (#96)

Author: garyzhang99 (committed via GitHub)
Date: 2026-01-16 17:25:49 +08:00
Commit: 5855c5161b (parent: 9503bda45d)
16 changed files with 2095 additions and 8 deletions


@@ -0,0 +1,241 @@
# ============================================
# Project and Experiment Configuration
# ============================================
# Project name for grouping experiments
project: AgentScope-Werewolves
# Unique name for this specific experiment run (training good guys)
name: Werewolves-7Player-GRPO-train-goodguy
# Root directory for saving checkpoints. Uses environment variable if set, otherwise defaults to ./checkpoints
checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,./checkpoints}
# ============================================
# Algorithm Configuration
# ============================================
algorithm:
# Algorithm type: multi_step_grpo for multi-step Group Relative Policy Optimization
algorithm_type: multi_step_grpo
# KL divergence loss function for regularization during training
# "low_var_kl": low-variance KL loss suitable for multi-step optimization
kl_loss_fn: "low_var_kl"
kl_loss_fn_args:
# KL coefficient: 0 means no KL penalty (pure reward optimization)
kl_coef: 0
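    # Illustration (comment only, not read by the trainer): "low_var_kl" is
    # commonly the low-variance k3 estimator; a rough sketch assuming that
    # form (the exact implementation may differ):
    #   log_ratio = logprob_ref - logprob_policy
    #   kl_est    = exp(log_ratio) - log_ratio - 1   # always >= 0
    # With kl_coef: 0 the estimate is tracked but adds no penalty.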
# Advantage function configuration for computing policy gradients
advantage_fn_args:
# Small epsilon value for numerical stability in advantage normalization
epsilon: 1e-6
# Normalize advantage by episode length (important for variable-length games)
step_norm: true
# Number of rollouts per task (group size for GRPO)
# Higher values provide better gradient estimates but require more compute
repeat_times: 32
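  # Illustration (comment only): a rough sketch of the group-relative
  # advantage GRPO computes over the 32 rollouts of one task, assuming the
  # standard normalize-by-group form (details may differ):
  #   A_i = (R_i - mean(R_1..R_32)) / (std(R_1..R_32) + epsilon)
  # With step_norm: true, A_i is further divided by the episode length,
  # so long and short games contribute comparably to the gradient.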
# Policy loss function arguments (PPO-style clipping)
policy_loss_fn_args:
# Lower bound for probability ratio clipping (prevents too large policy updates)
clip_range_low: 0.2
# Upper bound for probability ratio clipping
clip_range_high: 0.28
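    # Illustration (comment only): these bounds give the usual PPO-style
    # clipped objective, roughly:
    #   r    = pi_theta(a|s) / pi_old(a|s)
    #   loss = -min(r * A, clip(r, 1 - 0.2, 1 + 0.28) * A)
    # The wider upper bound lets probability ratios of good actions grow
    # slightly further than a symmetric 0.2 clip would allow.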
# Optimizer configuration
optimizer:
# Learning rate for policy updates
lr: 1e-6
# ============================================
# Model Configuration
# ============================================
model:
# Path to the base model (trainable model for good guy players: villager, seer, witch)
# Uses environment variable if set, otherwise defaults to Qwen3-4B-Instruct-2507
model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen3-4B-Instruct-2507}
# Maximum number of tokens the model can generate per response
max_response_tokens: 4096
# Total context length the model can handle (prompt + response)
max_model_len: 25600
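  # Illustration (comment only): with these limits each request leaves
  # about 25600 - 4096 = 21504 tokens for the prompt, i.e. the system
  # instructions plus the accumulated game history.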
# ============================================
# Cluster Configuration
# ============================================
cluster:
# Number of nodes in the Ray cluster
node_num: 4
# Number of GPUs per node (total GPUs = node_num * gpu_per_node = 32)
gpu_per_node: 8
# Ray cluster address: "auto" automatically detects the cluster
ray_address: auto
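  # Illustration (comment only): one plausible accounting of the 32 GPUs,
  # assuming each vLLM engine configured below occupies one dedicated GPU
  # (actual placement is decided by the framework):
  #   rollout_model   : 16 engines * tp 1 = 16 GPUs
  #   auxiliary model :  8 engines * tp 1 =  8 GPUs
  #   trainer         : the remaining       8 GPUs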
# ============================================
# Buffer Configuration (Data Pipeline)
# ============================================
buffer:
# Total training steps (iterations)
total_steps: 400
# Batch size for rollout collection (tasks per training step)
batch_size: 24
# Batch size for training (experiences per gradient update)
  # For multi-step GRPO, each rollout of a task produces multiple steps, so
  # experiences must be accumulated across steps before each gradient update.
  # Hence the training batch size exceeds the per-step episode count (2048 > 24 * 32 = 768)
train_batch_size: 2048
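  # Illustration (comment only): one training step gathers
  # 24 tasks * 32 rollouts = 768 game episodes; since each multi-step
  # episode yields several per-step experiences, well over 768 experiences
  # accumulate and are consumed in chunks of train_batch_size (2048).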
# Explorer input configuration (rollout data source)
explorer_input:
taskset:
# Dataset name
name: werewolves
# Storage type: "file" reads from local file system
storage_type: file
# Path to the dataset directory containing train.jsonl
path: data
# Dataset split to use
split: 'train'
# Rollout generation arguments
rollout_args:
# Sampling temperature for exploration (1.0 = full stochasticity)
temperature: 1.0
# Maximum tokens per generation
max_tokens: 4096
# Workflow-specific arguments
workflow_args:
# Which side to train: "werewolf" or "good_guy" (villager, seer, witch)
# - "werewolf": Train werewolf players
# - "good_guy": Train villager, seer, and witch players (this config)
trainable_target: good_guy
# Evaluation datasets (empty for this experiment)
eval_tasksets: []
# Trainer input configuration (training data source)
trainer_input:
experience_buffer:
# Name of the experience buffer
name: werewolves
# Storage type: "queue" for in-memory communication between explorer and trainer
storage_type: queue
# Maximum time (seconds) to wait for data from explorer before timeout
max_read_timeout: 7200
# Replay buffer configuration for experience reuse
replay_buffer:
# Enable replay buffer to reuse past experiences
enable: true
# ============================================
# Explorer Configuration (Rollout Generation)
# ============================================
explorer:
# Number of parallel workflow runners per rollout model instance
# Higher values = more parallel game simulations
runner_per_model: 16
# Maximum time (seconds) to wait for a single task completion
max_timeout: 3600
# Number of retries if a task fails or times out (0 = no retry)
max_retry_times: 0
# Number of times each task is repeated within a single runner
# (usually 1; repeat_times is handled at algorithm level)
max_repeat_times_per_runner: 1
# Over-rollout configuration (collect more data than needed)
over_rollout:
# Ratio of tasks to skip waiting for (0.2 = wait for 80% of batch_size, then proceed)
# Improves throughput by not waiting for slow tasks
ratio: 0.2
# Minimum wait time (seconds) after reaching the threshold before proceeding
wait_after_min: 15
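    # Illustration (comment only): with batch_size: 24 and ratio: 0.2 the
    # explorer stops waiting once roughly 24 * (1 - 0.2) = 19.2 (~20) tasks
    # have finished (exact rounding is an implementation detail), and even
    # then only after the extra 15-second grace period.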
# Dynamic timeout configuration (adaptive timeout based on task completion time)
dynamic_timeout:
# Enable dynamic timeout adjustment
enable: true
# Timeout multiplier: timeout = average_task_time * ratio
ratio: 4
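    # Illustration (comment only): if recent games average ~600 s, the
    # effective timeout becomes 600 * 4 = 2400 s, so stragglers are cut
    # off relative to typical game length rather than only by the fixed
    # max_timeout above.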
# Rollout model configuration (trainable model for good guy players)
rollout_model:
# Number of vLLM engine instances for parallel inference
engine_num: 16
# Tensor parallelism size (GPUs per engine instance)
tensor_parallel_size: 1
# Disable KV cache prefix sharing (usually False for training stability)
enable_prefix_caching: false
    # Enforce eager execution (false = allow CUDA graphs for faster inference)
enforce_eager: false
# Enable OpenAI-compatible API interface for vLLM
enable_openai_api: true
# Enable conversation history tracking
enable_history: true
# Enable automatic tool choice in function calling
enable_auto_tool_choice: true
# Tool call parser for structured outputs (hermes format)
tool_call_parser: hermes
# Data type for model weights and activations
dtype: bfloat16
# Random seed for reproducibility
seed: 42
# Auxiliary models configuration (for werewolf players when training good guys)
auxiliary_models:
- # Model name identifier
name: participant
# Path to auxiliary model (stronger model for stable baseline opponents)
model_path: ${oc.env:TRINITY_AUXILIARY_MODEL_PATH,Qwen/Qwen3-30B-A3B-Instruct-2507}
# Number of vLLM engines for auxiliary model (fewer than rollout model)
engine_num: 8
# Tensor parallelism size
tensor_parallel_size: 1
# Disable prefix caching
enable_prefix_caching: false
      # Enforce eager execution (false = use CUDA graphs)
enforce_eager: false
# Enable OpenAI API interface
enable_openai_api: true
# Enable automatic tool choice
enable_auto_tool_choice: true
# Tool call parser
tool_call_parser: hermes
# Data type
dtype: bfloat16
# Random seed
seed: 42
# ============================================
# Synchronizer Configuration (Weight Sync)
# ============================================
synchronizer:
# Synchronization style: "dynamic_by_explorer" = sync when explorer requests
# Alternatives: "fixed" (sync every N steps), "dynamic_by_trainer"
sync_style: dynamic_by_explorer
# Synchronization method: "nccl" uses NVIDIA NCCL for fast GPU-to-GPU communication
# Alternatives: "checkpoint" (slower, file-based sync)
sync_method: 'nccl'
# Sync weights every N training steps
sync_interval: 1
# Timeout (seconds) for weight synchronization before failing
sync_timeout: 72000
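  # Illustration (comment only): a rough picture of dynamic_by_explorer
  # syncing as we read it (details may differ):
  #   1. the explorer finishes its current batch of games
  #   2. the explorer requests the latest weights
  #   3. the trainer broadcasts them over NCCL
  # With sync_interval: 1, every trainer step is eligible to be pulled.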
# ============================================
# Trainer Configuration (Model Training)
# ============================================
trainer:
# Save checkpoint every N training steps (0 = only save at end)
save_interval: 100
# Gradient clipping threshold to prevent exploding gradients
grad_clip: 1.0
# Use dynamic batch size to maximize GPU memory utilization
use_dynamic_bsz: true
# Maximum token length per GPU for memory management
# Higher = more efficient but requires more VRAM
max_token_len_per_gpu: 16384
# Ulysses sequence parallelism size for handling long sequences
# 2 = split sequence across 2 GPUs
ulysses_sequence_parallel_size: 2
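  # Illustration (comment only): with sequence parallelism of 2, one
  # sequence's activations are sharded across 2 GPUs, so sequences up to
  # about 2 * 16384 = 32768 tokens fit the per-GPU budget, comfortably
  # above max_model_len (25600).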
# ============================================
# Monitor Configuration (Logging & Tracking)
# ============================================
monitor:
# Monitoring/logging backend: "wandb" for Weights & Biases
# Alternatives: "tensorboard"
monitor_type: wandb
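
# ============================================
# Usage (illustrative)
# ============================================
# A hypothetical launch command, assuming Trinity-RFT's `trinity run` CLI;
# the config path below is illustrative, not the actual file location:
#   trinity run --config examples/game_werewolves/train_good_guy.yaml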