---
project: AgentScope-ReAct
name: Learn_to_Ask-Qwen2.5-7B-fixed

# directory to save checkpoints, default to ./checkpoints if TRINITY_CHECKPOINT_ROOT_DIR not set
checkpoint_root_dir: ${oc.env:TRINITY_CHECKPOINT_ROOT_DIR,./checkpoints}

algorithm:
  algorithm_type: grpo  # a GRPO-based algorithm for multi-step reasoning

model:
  # path to the pre-trained model, default to Qwen/Qwen2.5-7B-Instruct if TRINITY_MODEL_PATH not set
  model_path: ${oc.env:TRINITY_MODEL_PATH,Qwen/Qwen2.5-7B-Instruct}
  # tinker config, you can set tinker parameters here
  # NOTE(review): nesting of `tinker` under `model` inferred from context — confirm against schema
  tinker:
    enable: false  # if true, tinker will be enabled

cluster:
  node_num: 1  # cluster with 1 node
  gpu_per_node: 8  # each node has 8 GPUs

buffer:
  total_epochs: 4  # run taskset for 4 epochs

explorer:
  runner_per_model: 32  # each model has 32 runners for parallel rollout
  max_timeout: 600  # max timeout for each rollout is 600 seconds

synchronizer:
  sync_style: fixed
  sync_method: nccl
  sync_interval: 10
  sync_timeout: 7200  # wait for 120 minutes

trainer:
  save_interval: 90  # save checkpoint every 90 steps
  use_dynamic_bsz: true
  ulysses_sequence_parallel_size: 1  # use sequence parallelism to reduce memory usage

monitor:
  monitor_type: wandb  # here we use wandb; you can also use tensorboard, mlflow or swanlab