# Training configuration for the `sailor2` SFT run of the Qwen2Glide model.
#
# Conventions used in this file:
#   * `${name}` values are interpolations that refer to other keys of this file.
#   * `_target_` values are dotted paths to Python callables; the sibling keys
#     look like their keyword arguments (presumably resolved by Hydra
#     `instantiate` -- confirm against the launcher).
#
# NOTE(review): the block structure below was reconstructed from a flattened
# copy of this file. Nesting follows the DeepSpeed config schema and the
# `_target_` groupings; please confirm it against the original.

# ---------------------------------------------------------------------------
# DeepSpeed engine configuration
# ---------------------------------------------------------------------------
ds_cfg:
  train_micro_batch_size_per_gpu: ${per_gpu_train_batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_steps}
  scheduler:
    type: WarmupCosineLR
    params:
      # Both step counts are null here -- presumably filled in at runtime
      # (see `warmup_proportion` / `warmup_steps` below); TODO confirm.
      total_num_steps: null
      warmup_num_steps: null
      warmup_type: linear
  optimizer:
    type: AdamW
    params:
      lr: ${learning_rate}
      betas:
        - 0.9
        - 0.95
      eps: 1.0e-06
      weight_decay: ${weight_decay}
  # NOTE(review): bf16 is disabled and fp16 is enabled in this section, while
  # the top-level `torch_dtype` is bfloat16 and `fp16_bfloat16` is true.
  # Presumably the trainer overrides these flags at runtime -- confirm.
  bf16:
    enabled: false
  zero_optimization:
    stage: 1
    offload_optimizer:
      device: cpu
      pin_memory: true
    # stage3_* keys are present although `stage` is 1.
    stage3_param_persistence_threshold: 100000.0
    stage3_max_live_parameters: 100000000.0
    stage3_prefetch_bucket_size: 100000000.0
    memory_efficient_linear: false
  steps_per_print: 1
  gradient_clipping: 1.0
  prescale_gradients: false
  fp16:
    enabled: true
    auto_cast: false
    # 0 selects dynamic loss scaling (see the DeepSpeed config reference).
    loss_scale: 0
    initial_scale_power: 16
    loss_scale_window: 1000
    hysteresis: 2
    consecutive_hysteresis: false
    min_loss_scale: 1

# ---------------------------------------------------------------------------
# Paths and data files
# ---------------------------------------------------------------------------
data_path_prefix: null
model_path_prefix: /home/aiops/yangph/checkpoints/glide/checkpoint/experiments/
output_path_prefix: /home/aiops/yangph/checkpoints/glide/checkpoint/
train_file: /home/aiops/yangph/data/raw_data/sailor2_sft_data_1101/sailor2_sft_1119_400k_stage2_only1turn.jsonl
dev_file: null
test_file: null

# ---------------------------------------------------------------------------
# Model and tokenizer
# ---------------------------------------------------------------------------
torch_dtype:
  _target_: general_util.training_utils.return_torch_dtype
  dtype: bfloat16

# The EOS token is reused as the pad token (tokenizer `pad_token`, model
# `pad_token_id`) and as the chat suffix in the data template.
eos_token: '<|endoftext|>'
eos_token_id: 151643

tokenizer_init:
  _target_: general_util.tokenization_utils.init_tokenizer
  tokenizer_path: ${model_name_or_path}
  padding_side: right
  pad_token: ${eos_token}

device_map:
  _target_: models.utils.return_single_device_map

model:
  _target_: models.qwen2_glide_du_2.Qwen2Glide.from_pretrained
  gradient_checkpointing: false
  attn_implementation: flash_attention_2
  ignore_mismatched_sizes: true
  torch_dtype: ${torch_dtype}
  pad_token_id: ${eos_token_id}

# ---------------------------------------------------------------------------
# Dataset construction and batching
# ---------------------------------------------------------------------------
read_tensor:
  _target_: data.combine_dataset.MultiMappingDataset
  read_fn:
    _target_: data.input_utils.jsonl_read_fn
  aligner:
    _target_: data.input_aligner.add_id_aligner
    id_field: id
  template:
    _target_: data.input_utils.recompose_template
    # `units` are named string fragments; `compositions` combine them by name.
    units:
      chat_prefix: ''
      question: '{user}'
      answer: '{assistant}'
      chat_suffix: ${eos_token}
    compositions:
      prompt: '{question}'
      chosen: '{answer}{chat_suffix}'
  # NOTE(review): `instruction`, `index_field` and `kv_mapping` are placed as
  # arguments of the dataset (not of `template`) -- confirm.
  instruction: ''
  index_field: id
  kv_mapping:
    chosen: chosen
    id: index
    prompt: prompt

dist_load_data_barrier: false
extended_vocab: null

collator:
  _target_: data.general_collator.SFTCollator
  tokenizer: ${tokenizer_init}
  max_seq_length: 4096

num_workers: 8
prefetch_factor: 2

# ---------------------------------------------------------------------------
# Checkpoint selection and parallel layout
# ---------------------------------------------------------------------------
model_name: sailor2_8B_sft_500step_hf_1102_longxu.sailor2_5b.glide.A100.tp1dp1.zero1.v1.0.8gpu_du_2.s42/checkpoint-last
# NOTE(review): `model_path_prefix` already ends with "/", so this resolves to
# a path containing "//". Left unchanged to keep the resolved string identical.
model_name_or_path: ${model_path_prefix}/${model_name}/
pretrain: null
resume: latest

dp_size: 8
tp_size: 1
pp_size: 1

# ---------------------------------------------------------------------------
# Experiment bookkeeping
# ---------------------------------------------------------------------------
wandb_project: sailor2
exp_name: sailor2_8b_sft.glide.A100.tp1dp1.zero1.v1.0.8gpu_du_2.5e-6.s${seed}
exp_notes: null
output_dir: ${output_path_prefix}experiments/${exp_name}

# ---------------------------------------------------------------------------
# Training / evaluation switches and hyper-parameters
# ---------------------------------------------------------------------------
do_train: true
evaluate_during_training: false
do_eval: false
eval_sub_path: 'checkpoint-*'

per_gpu_train_batch_size: 2
per_gpu_eval_batch_size: 2
learning_rate: 5.0e-06
gradient_accumulation_steps: 64
weight_decay: 0.1
# NOTE(review): `adam_betas` here is (0.9, 0.98) while `ds_cfg.optimizer`
# uses betas [0.9, 0.95]; with the top-level `optimizer` set to null the
# DeepSpeed values are presumably the ones in effect -- confirm.
adam_epsilon: 1.0e-06
adam_betas: '(0.9, 0.98)'
total_dataset_len: -1
max_grad_norm: 1.0
num_train_epochs: 2
max_steps: 0
warmup_proportion: 0.1
warmup_steps: 0

optimizer: null
use_nvlamb: null
bit_training: null

logging_steps: 1
save_ds_state: true
save_steps: 200
save_best: false
eval_steps: 200
ddp_eval: true
no_cuda: false
seed: 42
local_rank: 0

fp16: true
fp16_opt_level: O1
fp16_bfloat16: true

# ---------------------------------------------------------------------------
# Evaluation, post-processing and logging
# ---------------------------------------------------------------------------
prediction_cfg:
  metric: loss
  measure: -1
  best_checkpoint: null
  best_result: null

eval_forward_fn:
  _target_: general_util.evaluator.DefaultForwardFn

post_process:
  _target_: post_processors.dpo.SFTLossOnlyPostProcessor

summary_helper:
  _target_: general_util.tensorboard_helper.WandbWriter
  batch_index_or_keys: null
  # Maps a logged metric name to a key of the model outputs
  # (presumably -- confirm against WandbWriter).
  outputs_index_or_keys:
    train/loss: loss
    train/large_loss: llm_loss

# ---------------------------------------------------------------------------
# Runtime / distributed state
# ---------------------------------------------------------------------------
n_gpu: 1
device: 'cuda:0'
train_batch_size: null
eval_batch_size: null
world_size: 8