# Multi-step GRPO training configuration for the "Email_search" experiment.
#
# Values of the form ${oc.env:VAR,default} are OmegaConf interpolations: the
# environment variable VAR is used when set, otherwise the default after the
# comma.
#
# NOTE(review): this file was restored from a copy in which all line breaks
# and indentation had been lost (everything after the first `#` was being
# parsed as a comment). Keys, values and comments are unchanged; the nesting
# below is inferred from key order -- confirm it against the consumer's
# config schema.

project: "AgentScope"  # Project name
name: "Email_search"  # Experiment name
checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,./checkpoints}  # Directory to save model checkpoints

algorithm:
  algorithm_type: multi_step_grpo  # GRPO series for multi-step scenario
  repeat_times: 8  # Number of rollouts per prompt for advantage estimation
  optimizer:
    # Written with an explicit mantissa dot so that YAML 1.1 loaders also
    # read a float rather than the string "1e-6".
    lr: 1.0e-6  # Learning rate
  policy_loss_fn: "rec"  # Policy loss function
  policy_loss_fn_args:  # Policy loss function arguments
    epsilon_low: 0.2
    epsilon_high: 0.2
    clip_mode: "one-side"
    weight: "none"
    temp: 1.0
    regularizer: "none"
    regularizer_coef: 0.0
  kl_loss_fn: 'k2'  # KL divergence loss function
  kl_loss_fn_args:
    kl_coef: 0.0  # KL divergence coefficient
  advantage_fn_args:
    std_cal_level: 'batch'  # Advantage normalization level

model:
  model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen3-4B-Instruct-2507}  # Base model path
  max_response_tokens: 4096  # Max tokens per response
  max_model_len: 20480  # Max context length

buffer:
  total_epochs: 10  # Total training epochs
  batch_size: 64  # Batch size per explore step
  train_batch_size: 2560  # 64*8*5, total experiences per training step
  trainer_input:
    experience_buffer:
      name: experience_buffer
      storage_type: queue
      replay_buffer:
        enable: true  # Enable experience replay
        priority_fn: 'decay_limit_randomization'
        priority_fn_args:
          decay: 2.0
          use_count_limit: 3
          sigma: 2.0

explorer:
  eval_interval: 10
  max_repeat_times_per_runner: 1  # Max repeat times per runner
  max_timeout: 3600  # Max timeout for each rollout (seconds)
  rollout_model:
    enable_history: true  # Enable conversation history
    enable_openai_api: true  # Enable OpenAI-compatible API
    enable_auto_tool_choice: true  # Enable automatic tool selection
    tool_call_parser: hermes  # Parser for tool calls
    engine_num: 4  # Number of vLLM engines for rollout model
    tensor_parallel_size: 1  # TP size per engine for rollout model
    enable_prefix_caching: false  # Disable prefix caching
  auxiliary_models:
    - name: judge
      model_path: Qwen/Qwen3-30B-A3B-Instruct-2507  # Judge model path
      engine_num: 1  # Number of vLLM engines for judge model
      tensor_parallel_size: 2  # TP size per engine for judge model
      enable_thinking: false  # Disable thinking/reasoning mode
      max_prompt_tokens: 2048  # Max tokens for prompt
      max_response_tokens: 128  # Max tokens for response
      max_model_len: 2500  # Max model context length

synchronizer:
  sync_style: dynamic_by_explorer  # Sync triggered dynamically by explorer
  sync_interval: 5  # Sync every N steps
  sync_timeout: 3600  # Timeout for synchronization (seconds)

trainer:
  save_interval: 100  # Save checkpoint every N steps
  grad_clip: 1.0  # Gradient clipping value
  use_dynamic_bsz: true  # Use dynamic batch size
  max_token_len_per_gpu: 16384  # Max token length per GPU
  ulysses_sequence_parallel_size: 1  # Sequence parallel size for Ulysses