# ============================================
# Project and Experiment Configuration
# ============================================
# Project name for grouping experiments
project: AgentScope-Werewolves
# Unique name for this specific experiment run
name: Werewolves-7Player-GRPO
# Root directory for saving checkpoints. Uses the environment variable if set, otherwise defaults to ./checkpoints
checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,./checkpoints}
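# For example (assuming a bash-style shell), the default can be overridden with:
#   export TRINITY_CHECKPOINT_ROOT_DIR=/mnt/shared/checkpoints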

# ============================================
# Algorithm Configuration
# ============================================
algorithm:
  # Algorithm type: multi_step_grpo for multi-step Group Relative Policy Optimization
  algorithm_type: multi_step_grpo

  # KL divergence loss function for regularization during training
  # "low_var_kl": low-variance KL loss suitable for multi-step optimization
  kl_loss_fn: "low_var_kl"
  kl_loss_fn_args:
    # KL coefficient: 0 means no KL penalty (pure reward optimization)
    kl_coef: 0
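    # Note (an assumption about the backend, not taken from this file): "low_var_kl"
    # usually denotes the low-variance k3 estimator,
    # kl ≈ exp(logp_ref - logp) - (logp_ref - logp) - 1; with kl_coef: 0 the penalty
    # is effectively disabled either way.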

  # Advantage function configuration for computing policy gradients
  advantage_fn_args:
    # Small epsilon value for numerical stability in advantage normalization
    epsilon: 1e-6
    # Normalize advantage by episode length (important for variable-length games)
    step_norm: true
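    # Sketch of the usual group-relative advantage (assumed form, not copied from the
    # implementation): A_i ≈ (R_i - mean(R_group)) / (std(R_group) + epsilon), where the
    # group is the repeat_times rollouts of the same task; with step_norm: true the
    # advantage is additionally normalized by the episode's step count.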

  # Number of rollouts per task (group size for GRPO)
  # Higher values provide better gradient estimates but require more compute
  repeat_times: 32
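  # With batch_size: 24 (see the buffer section below), each exploration step plays
  # 24 * 32 = 768 games in total.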

  # Policy loss function arguments (PPO-style clipping)
  policy_loss_fn_args:
    # Lower bound for probability-ratio clipping (prevents overly large policy updates)
    clip_range_low: 0.2
    # Upper bound for probability-ratio clipping
    clip_range_high: 0.28
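    # These bounds feed the standard PPO-style clipped loss (assumed form):
    # L = -min(r * A, clip(r, 1 - 0.2, 1 + 0.28) * A), with r = pi_new / pi_old;
    # the slightly larger upper bound lets well-rewarded actions move a bit further.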

  # Optimizer configuration
  optimizer:
    # Learning rate for policy updates
    lr: 1e-6

# ============================================
# Model Configuration
# ============================================
model:
  # Path to the base model (trainable model for the werewolf players)
  # Uses the environment variable if set, otherwise defaults to Qwen2.5-7B-Instruct
  model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen2.5-7B-Instruct}
  # Maximum number of tokens the model can generate per response
  max_response_tokens: 4096
  # Total context length the model can handle (prompt + response)
  max_model_len: 25600
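  # Rough budget (arithmetic only): 25600 total - 4096 response tokens leaves about
  # 21504 tokens for the prompt and accumulated game history.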

# ============================================
# Cluster Configuration
# ============================================
cluster:
  # Number of nodes in the Ray cluster
  node_num: 4
  # Number of GPUs per node (total GPUs = node_num * gpu_per_node = 32)
  gpu_per_node: 8
  # Ray cluster address: "auto" automatically detects the cluster
  ray_address: auto
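  # Rough GPU split (assuming inference engines and the trainer occupy disjoint GPUs):
  # the rollout model below uses 16 engines * TP 1 = 16 GPUs, the auxiliary model
  # 8 engines * TP 1 = 8 GPUs, leaving 8 of the 32 GPUs for the trainer.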

# ============================================
# Buffer Configuration (Data Pipeline)
# ============================================
buffer:
  # Total training steps (iterations)
  total_steps: 400
  # Batch size for rollout collection (tasks per training step)
  batch_size: 24
  # Batch size for training (experiences per gradient update)
  # In multi-step GRPO, each rollout of a task produces multiple step-level experiences,
  # so experiences are accumulated across rollouts before each gradient update.
  # Hence the train batch size is larger than the rollout batch (2048 > 24 * 32 = 768).
  train_batch_size: 2048
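  # Illustrative only (the step count per game varies): if an average game contributes
  # ~3 trainable steps per rollout, one exploration step yields ~768 * 3 ≈ 2304
  # experiences, enough to fill a 2048-experience training batch.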

  # Explorer input configuration (rollout data source)
  explorer_input:
    taskset:
      # Dataset name
      name: werewolves
      # Storage type: "file" reads from the local file system
      storage_type: file
      # Path to the dataset directory containing train.jsonl
      path: data
      # Dataset split to use
      split: 'train'
      # Rollout generation arguments
      rollout_args:
        # Sampling temperature for exploration (1.0 = full stochasticity)
        temperature: 1.0
        # Maximum tokens per generation
        max_tokens: 4096
      # Workflow-specific arguments
      workflow_args:
        # Which side to train: "werewolf" or "good_guy" (villager, seer, witch)
        # - "werewolf": train the werewolf players (default)
        # - "good_guy": train the villager, seer, and witch players
        trainable_target: werewolf
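        # For example, set `trainable_target: good_guy` here to train the village side instead.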
    # Evaluation datasets (empty for this experiment)
    eval_tasksets: []

  # Trainer input configuration (training data source)
  trainer_input:
    experience_buffer:
      # Name of the experience buffer
      name: werewolves
      # Storage type: "queue" for in-memory communication between explorer and trainer
      storage_type: queue
      # Maximum time (seconds) to wait for data from the explorer before timing out
      max_read_timeout: 7200
      # Replay buffer configuration for experience reuse
      replay_buffer:
        # Enable the replay buffer to reuse past experiences
        enable: true

# ============================================
# Explorer Configuration (Rollout Generation)
# ============================================
explorer:
  # Number of parallel workflow runners per rollout model instance
  # Higher values = more parallel game simulations
  runner_per_model: 16
  # Maximum time (seconds) to wait for a single task to complete
  max_timeout: 3600
  # Number of retries if a task fails or times out (0 = no retry)
  max_retry_times: 0
  # Number of times each task is repeated within a single runner
  # (usually 1; repeat_times is handled at the algorithm level)
  max_repeat_times_per_runner: 1

  # Over-rollout configuration (collect more data than strictly needed)
  over_rollout:
    # Fraction of tasks the explorer may skip waiting for
    # (0.2 = proceed once 80% of batch_size tasks have finished)
    # Improves throughput by not blocking on slow tasks
    ratio: 0.2
    # Minimum wait time (seconds) after reaching the threshold before proceeding
    wait_after_min: 15
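    # Arithmetic example using the values above: with batch_size: 24 and ratio: 0.2,
    # the explorer can proceed once roughly 24 * 0.8 ≈ 20 tasks have finished.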

  # Dynamic timeout configuration (adaptive timeout based on observed task completion time)
  dynamic_timeout:
    # Enable dynamic timeout adjustment
    enable: true
    # Timeout multiplier: timeout = average_task_time * ratio
    ratio: 4
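    # Illustrative example: if games finish in ~600 s on average, the adjusted timeout
    # becomes 600 * 4 = 2400 s.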

  # Rollout model configuration (the trainable model for the werewolf players)
  rollout_model:
    # Number of vLLM engine instances for parallel inference
    engine_num: 16
    # Tensor parallelism size (GPUs per engine instance)
    tensor_parallel_size: 1
    # KV-cache prefix sharing (kept off here; often disabled for training stability)
    enable_prefix_caching: false
    # Enforce eager execution (false = use CUDA graphs for faster inference)
    enforce_eager: false
    # Enable the OpenAI-compatible API interface for vLLM
    enable_openai_api: true
    # Enable conversation history tracking
    enable_history: true
    # Enable automatic tool choice in function calling
    enable_auto_tool_choice: true
    # Tool call parser for structured outputs (hermes format)
    tool_call_parser: hermes
    # Data type for model weights and activations
    dtype: bfloat16
    # Random seed for reproducibility
    seed: 42

  # Auxiliary models configuration (for non-werewolf players: villagers, seer, witch)
  auxiliary_models:
    # Model name identifier
    - name: participant
      # Path to the auxiliary model (a stronger model used as a stable baseline opponent)
      model_path: ${oc.env:TRINITY_AUXILIARY_MODEL_PATH,Qwen/Qwen3-30B-A3B-Instruct-2507}
      # Number of vLLM engines for the auxiliary model (fewer than the rollout model)
      engine_num: 8
      # Tensor parallelism size
      tensor_parallel_size: 1
      # KV-cache prefix sharing (kept off, matching the rollout model)
      enable_prefix_caching: false
      # Enforce eager execution (false = use CUDA graphs)
      enforce_eager: false
      # Enable the OpenAI-compatible API interface
      enable_openai_api: true
      # Enable automatic tool choice
      enable_auto_tool_choice: true
      # Tool call parser
      tool_call_parser: hermes
      # Data type
      dtype: bfloat16
      # Random seed
      seed: 42

# ============================================
# Synchronizer Configuration (Weight Sync)
# ============================================
synchronizer:
  # Synchronization style: "dynamic_by_explorer" = sync when the explorer requests it
  # Alternatives: "fixed" (sync every N steps), "dynamic_by_trainer"
  sync_style: dynamic_by_explorer
  # Synchronization method: "nccl" uses NVIDIA NCCL for fast GPU-to-GPU communication
  # Alternative: "checkpoint" (slower, file-based sync)
  sync_method: 'nccl'
  # Sync weights every N training steps
  sync_interval: 1
  # Timeout (seconds) for weight synchronization before failing
  sync_timeout: 72000

# ============================================
# Trainer Configuration (Model Training)
# ============================================
trainer:
  # Save a checkpoint every N training steps (0 = only save at the end)
  save_interval: 100
  # Gradient clipping threshold to prevent exploding gradients
  grad_clip: 1.0
  # Use dynamic batch size to maximize GPU memory utilization
  use_dynamic_bsz: true
  # Maximum token length per GPU for memory management
  # Higher = more efficient but requires more VRAM
  max_token_len_per_gpu: 16384
  # Ulysses sequence parallelism size for handling long sequences
  # 2 = split each sequence across 2 GPUs
  ulysses_sequence_parallel_size: 2
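  # Rough capacity check (an assumption about how the per-GPU budget combines with
  # sequence parallelism): 2 GPUs * 16384 tokens = 32768 tokens per sequence group,
  # which covers max_model_len: 25600.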

# ============================================
# Monitor Configuration (Logging & Tracking)
# ============================================
monitor:
  # Monitoring/logging backend: "wandb" for Weights & Biases
  # Alternatives: "tensorboard"
  monitor_type: wandb