Add examples for werewolf game tuner (#96)

Author: garyzhang99 (committed via GitHub)
Date: 2026-01-16 17:25:49 +08:00
Commit: 5855c5161b (parent: 9503bda45d)
16 changed files with 2095 additions and 8 deletions


@@ -0,0 +1,241 @@
# ============================================
# Project and Experiment Configuration
# ============================================
# Project name for grouping experiments
project: AgentScope-Werewolves
# Unique name for this specific experiment run (training good guys)
name: Werewolves-7Player-GRPO-train-goodguy
# Root directory for saving checkpoints. Uses environment variable if set, otherwise defaults to ./checkpoints
checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,./checkpoints}
# ============================================
# Algorithm Configuration
# ============================================
algorithm:
# Algorithm type: multi_step_grpo for multi-step Group Relative Policy Optimization
algorithm_type: multi_step_grpo
# KL divergence loss function for regularization during training
# "low_var_kl": low-variance KL loss suitable for multi-step optimization
kl_loss_fn: "low_var_kl"
kl_loss_fn_args:
# KL coefficient: 0 means no KL penalty (pure reward optimization)
kl_coef: 0
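    # Illustration (comment only, not read by the trainer): "low_var_kl" is
    # commonly the low-variance k3 estimator; a rough sketch assuming that
    # form (the exact implementation may differ):
    #   log_ratio = logprob_ref - logprob_policy
    #   kl_est    = exp(log_ratio) - log_ratio - 1   # always >= 0
    # With kl_coef: 0 the estimate is tracked but adds no penalty.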
# Advantage function configuration for computing policy gradients
advantage_fn_args:
# Small epsilon value for numerical stability in advantage normalization
epsilon: 1e-6
# Normalize advantage by episode length (important for variable-length games)
step_norm: true
# Number of rollouts per task (group size for GRPO)
# Higher values provide better gradient estimates but require more compute
repeat_times: 32
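  # Illustration (comment only): a rough sketch of the group-relative
  # advantage GRPO computes over the 32 rollouts of one task, assuming the
  # standard normalize-by-group form (details may differ):
  #   A_i = (R_i - mean(R_1..R_32)) / (std(R_1..R_32) + epsilon)
  # With step_norm: true, A_i is further divided by the episode length,
  # so long and short games contribute comparably to the gradient.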
# Policy loss function arguments (PPO-style clipping)
policy_loss_fn_args:
# Lower bound for probability ratio clipping (prevents too large policy updates)
clip_range_low: 0.2
# Upper bound for probability ratio clipping
clip_range_high: 0.28
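    # Illustration (comment only): these bounds give the usual PPO-style
    # clipped objective, roughly:
    #   r    = pi_theta(a|s) / pi_old(a|s)
    #   loss = -min(r * A, clip(r, 1 - 0.2, 1 + 0.28) * A)
    # The wider upper bound lets probability ratios of good actions grow
    # slightly further than a symmetric 0.2 clip would allow.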
# Optimizer configuration
optimizer:
# Learning rate for policy updates
lr: 1e-6
# ============================================
# Model Configuration
# ============================================
model:
# Path to the base model (trainable model for good guy players: villager, seer, witch)
# Uses environment variable if set, otherwise defaults to Qwen3-4B-Instruct-2507
model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen3-4B-Instruct-2507}
# Maximum number of tokens the model can generate per response
max_response_tokens: 4096
# Total context length the model can handle (prompt + response)
max_model_len: 25600
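  # Illustration (comment only): with these limits each request leaves
  # about 25600 - 4096 = 21504 tokens for the prompt, i.e. the system
  # instructions plus the accumulated game history.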
# ============================================
# Cluster Configuration
# ============================================
cluster:
# Number of nodes in the Ray cluster
node_num: 4
# Number of GPUs per node (total GPUs = node_num * gpu_per_node = 32)
gpu_per_node: 8
# Ray cluster address: "auto" automatically detects the cluster
ray_address: auto
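  # Illustration (comment only): one plausible accounting of the 32 GPUs,
  # assuming each vLLM engine configured below occupies one dedicated GPU
  # (actual placement is decided by the framework):
  #   rollout_model   : 16 engines * tp 1 = 16 GPUs
  #   auxiliary model :  8 engines * tp 1 =  8 GPUs
  #   trainer         : the remaining       8 GPUs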
# ============================================
# Buffer Configuration (Data Pipeline)
# ============================================
buffer:
# Total training steps (iterations)
total_steps: 400
# Batch size for rollout collection (tasks per training step)
batch_size: 24
# Batch size for training (experiences per gradient update)
  # For multi-step GRPO, each rollout of a task produces multiple steps, so
  # experiences must be accumulated across steps before each gradient update.
  # Hence the training batch size exceeds the per-step episode count (2048 > 24 * 32 = 768)
train_batch_size: 2048
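  # Illustration (comment only): one training step gathers
  # 24 tasks * 32 rollouts = 768 game episodes; since each multi-step
  # episode yields several per-step experiences, well over 768 experiences
  # accumulate and are consumed in chunks of train_batch_size (2048).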
# Explorer input configuration (rollout data source)
explorer_input:
taskset:
# Dataset name
name: werewolves
# Storage type: "file" reads from local file system
storage_type: file
# Path to the dataset directory containing train.jsonl
path: data
# Dataset split to use
split: 'train'
# Rollout generation arguments
rollout_args:
# Sampling temperature for exploration (1.0 = full stochasticity)
temperature: 1.0
# Maximum tokens per generation
max_tokens: 4096
# Workflow-specific arguments
workflow_args:
# Which side to train: "werewolf" or "good_guy" (villager, seer, witch)
# - "werewolf": Train werewolf players
# - "good_guy": Train villager, seer, and witch players (this config)
trainable_target: good_guy
# Evaluation datasets (empty for this experiment)
eval_tasksets: []
# Trainer input configuration (training data source)
trainer_input:
experience_buffer:
# Name of the experience buffer
name: werewolves
# Storage type: "queue" for in-memory communication between explorer and trainer
storage_type: queue
# Maximum time (seconds) to wait for data from explorer before timeout
max_read_timeout: 7200
# Replay buffer configuration for experience reuse
replay_buffer:
# Enable replay buffer to reuse past experiences
enable: true
# ============================================
# Explorer Configuration (Rollout Generation)
# ============================================
explorer:
# Number of parallel workflow runners per rollout model instance
# Higher values = more parallel game simulations
runner_per_model: 16
# Maximum time (seconds) to wait for a single task completion
max_timeout: 3600
# Number of retries if a task fails or times out (0 = no retry)
max_retry_times: 0
# Number of times each task is repeated within a single runner
# (usually 1; repeat_times is handled at algorithm level)
max_repeat_times_per_runner: 1
# Over-rollout configuration (collect more data than needed)
over_rollout:
# Ratio of tasks to skip waiting for (0.2 = wait for 80% of batch_size, then proceed)
# Improves throughput by not waiting for slow tasks
ratio: 0.2
# Minimum wait time (seconds) after reaching the threshold before proceeding
wait_after_min: 15
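    # Illustration (comment only): with batch_size: 24 and ratio: 0.2 the
    # explorer stops waiting once roughly 24 * (1 - 0.2) = 19.2 (~20) tasks
    # have finished (exact rounding is an implementation detail), and even
    # then only after the extra 15-second grace period.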
# Dynamic timeout configuration (adaptive timeout based on task completion time)
dynamic_timeout:
# Enable dynamic timeout adjustment
enable: true
# Timeout multiplier: timeout = average_task_time * ratio
ratio: 4
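    # Illustration (comment only): if recent games average ~600 s, the
    # effective timeout becomes 600 * 4 = 2400 s, so stragglers are cut
    # off relative to typical game length rather than only by the fixed
    # max_timeout above.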
# Rollout model configuration (trainable model for good guy players)
rollout_model:
# Number of vLLM engine instances for parallel inference
engine_num: 16
# Tensor parallelism size (GPUs per engine instance)
tensor_parallel_size: 1
# Disable KV cache prefix sharing (usually False for training stability)
enable_prefix_caching: false
    # Enforce eager execution (false = allow CUDA graphs for faster inference)
enforce_eager: false
# Enable OpenAI-compatible API interface for vLLM
enable_openai_api: true
# Enable conversation history tracking
enable_history: true
# Enable automatic tool choice in function calling
enable_auto_tool_choice: true
# Tool call parser for structured outputs (hermes format)
tool_call_parser: hermes
# Data type for model weights and activations
dtype: bfloat16
# Random seed for reproducibility
seed: 42
# Auxiliary models configuration (for werewolf players when training good guys)
auxiliary_models:
- # Model name identifier
name: participant
# Path to auxiliary model (stronger model for stable baseline opponents)
model_path: ${oc.env:TRINITY_AUXILIARY_MODEL_PATH,Qwen/Qwen3-30B-A3B-Instruct-2507}
# Number of vLLM engines for auxiliary model (fewer than rollout model)
engine_num: 8
# Tensor parallelism size
tensor_parallel_size: 1
# Disable prefix caching
enable_prefix_caching: false
      # Enforce eager execution (false = use CUDA graphs)
enforce_eager: false
# Enable OpenAI API interface
enable_openai_api: true
# Enable automatic tool choice
enable_auto_tool_choice: true
# Tool call parser
tool_call_parser: hermes
# Data type
dtype: bfloat16
# Random seed
seed: 42
# ============================================
# Synchronizer Configuration (Weight Sync)
# ============================================
synchronizer:
# Synchronization style: "dynamic_by_explorer" = sync when explorer requests
# Alternatives: "fixed" (sync every N steps), "dynamic_by_trainer"
sync_style: dynamic_by_explorer
# Synchronization method: "nccl" uses NVIDIA NCCL for fast GPU-to-GPU communication
# Alternatives: "checkpoint" (slower, file-based sync)
sync_method: 'nccl'
# Sync weights every N training steps
sync_interval: 1
# Timeout (seconds) for weight synchronization before failing
sync_timeout: 72000
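  # Illustration (comment only): a rough picture of dynamic_by_explorer
  # syncing as we read it (details may differ):
  #   1. the explorer finishes its current batch of games
  #   2. the explorer requests the latest weights
  #   3. the trainer broadcasts them over NCCL
  # With sync_interval: 1, every trainer step is eligible to be pulled.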
# ============================================
# Trainer Configuration (Model Training)
# ============================================
trainer:
# Save checkpoint every N training steps (0 = only save at end)
save_interval: 100
# Gradient clipping threshold to prevent exploding gradients
grad_clip: 1.0
# Use dynamic batch size to maximize GPU memory utilization
use_dynamic_bsz: true
# Maximum token length per GPU for memory management
# Higher = more efficient but requires more VRAM
max_token_len_per_gpu: 16384
# Ulysses sequence parallelism size for handling long sequences
# 2 = split sequence across 2 GPUs
ulysses_sequence_parallel_size: 2
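  # Illustration (comment only): with sequence parallelism of 2, one
  # sequence's activations are sharded across 2 GPUs, so sequences up to
  # about 2 * 16384 = 32768 tokens fit the per-GPU budget, comfortably
  # above max_model_len (25600).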
# ============================================
# Monitor Configuration (Logging & Tracking)
# ============================================
monitor:
# Monitoring/logging backend: "wandb" for Weights & Biases
# Alternatives: "tensorboard"
monitor_type: wandb
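
# ============================================
# Usage (illustrative)
# ============================================
# A hypothetical launch command, assuming Trinity-RFT's `trinity run` CLI;
# the config path below is illustrative, not the actual file location:
#   trinity run --config examples/game_werewolves/train_good_guy.yaml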