---
# Training configuration for the experiment named in trainer.experiment_name.
#
# This file was rebuilt in block style from a whitespace-collapsed dump; every
# key and value is preserved verbatim. Nesting was inferred from key order, so
# any placement that could not be determined from the flattened text alone is
# marked NOTE(review) and should be confirmed against the consuming code.

data:
  tokenizer: null
  use_shm: false
  train_files: /var/lib/condor/execute/slot1/dir_3823822/scratch/agentic-rllm/rllm/data/datasets/sudoku/train_verl.parquet
  val_files: /var/lib/condor/execute/slot1/dir_3823822/scratch/agentic-rllm/rllm/data/datasets/sudoku/test_verl.parquet
  prompt_key: prompt
  reward_fn_key: data_source
  max_prompt_length: 2000
  max_response_length: 50
  train_batch_size: 128
  val_batch_size: 500
  return_raw_input_ids: false
  return_raw_chat: false
  return_full_prompt: false
  shuffle: true
  filter_overlong_prompts: false
  filter_overlong_prompts_workers: 1
  truncation: error
  image_key: images
  video_key: videos
  trust_remote_code: false
  custom_cls:
    path: null
    name: null

actor_rollout_ref:
  hybrid_engine: true
  model:
    path: /var/lib/condor/execute/slot1/dir_3823822/scratch/model_ckpt
    use_shm: false
    external_lib: null
    override_config: {}
    enable_gradient_checkpointing: true
    enable_activation_offload: false
    use_remove_padding: true
    lora_rank: 0
    lora_alpha: 16
    target_modules: all-linear
    use_liger: false
    use_fused_kernels: false
    trust_remote_code: false
    save_hf_repo_id: yurunyyr/agentic-sudoku-Markov-qwen2.5-3B_6-6_SFT-24k-30_prm0_actor
    tokenizer_chat_template: default
  actor:
    strategy: fsdp
    ppo_mini_batch_size: 128
    ppo_num_mini_batches: 1
    ppo_micro_batch_size: null
    ppo_micro_batch_size_per_gpu: 32
    use_dynamic_bsz: false
    use_dynamic_mini_batch: false
    ppo_max_token_len_per_gpu: 16384
    grad_clip: 1.0
    clip_ratio: 0.2
    clip_ratio_low: 0.2
    clip_ratio_high: 0.2
    clip_ratio_c: 3.0
    loss_agg_mode: seq-mean-token-mean
    entropy_coeff: 0
    use_kl_loss: true
    use_torch_compile: true
    kl_loss_coef: 0.001
    kl_loss_type: low_var_kl
    ppo_epochs: 1
    shuffle: false
    ulysses_sequence_parallel_size: 1
    checkpoint:
      contents:
        - model
        - optimizer
        - extra
    optim:
      lr: 1.0e-06
      lr_warmup_steps: -1
      lr_warmup_steps_ratio: 0.0
      min_lr_ratio: null
      num_cycles: 0.5
      warmup_style: constant
      total_training_steps: 55
      weight_decay: 0.01
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      optimizer_offload: false
      offload_policy: false
      reshard_after_forward: true
      fsdp_size: -1
    # NOTE(review): placed at actor level; the flattened source does not show
    # whether this key belongs here or inside fsdp_config — confirm.
    grad_norm_threshold: 10
  ref:
    strategy: fsdp
    fsdp_config:
      param_offload: false
      reshard_after_forward: true
      wrap_policy:
        min_num_params: 0
    use_torch_compile: true
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 32
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    ulysses_sequence_parallel_size: 1
  rollout:
    name: vllm
    mode: async
    chat_scheduler: verl.schedulers.completions_scheduler.CompletionsScheduler
    chat_template: null
    temperature: 1.0
    top_k: -1
    top_p: 1
    use_fire_sampling: false
    prompt_length: 2000
    response_length: 50
    dtype: bfloat16
    gpu_memory_utilization: 0.5
    ignore_eos: false
    enforce_eager: false
    free_cache_engine: false
    load_format: dummy_dtensor
    layered_summon: false
    tensor_model_parallel_size: 1
    max_num_batched_tokens: 8192
    max_model_len: null
    max_num_seqs: 1024
    log_prob_micro_batch_size: null
    log_prob_micro_batch_size_per_gpu: 32
    log_prob_use_dynamic_bsz: false
    log_prob_max_token_len_per_gpu: 16384
    disable_log_stats: true
    enable_chunked_prefill: true
    do_sample: true
    # Key is quoted because a bare n is read as a boolean by YAML 1.1 parsers.
    'n': 8
    engine_kwargs:
      vllm:
        swap_space: null
      sglang:
        attention_backend: null
    val_kwargs:
      top_k: -1
      top_p: 0.95
      temperature: 0.6
      'n': 1
      do_sample: false
    multi_turn:
      enable: false
      max_turns: null
      tool_config_path: null
      format: chatml
    # NOTE(review): placed at rollout level; it may instead belong inside
    # multi_turn — confirm.
    disable_logging: true

critic:
  rollout_n: 8
  strategy: fsdp
  optim:
    lr: 1.0e-05
    lr_warmup_steps_ratio: 0.0
    min_lr_ratio: null
    warmup_style: constant
    total_training_steps: 55
    weight_decay: 0.01
  model:
    path: ~/models/deepseek-llm-7b-chat
    use_shm: false
    tokenizer_path: /var/lib/condor/execute/slot1/dir_3823822/scratch/model_ckpt
    override_config: {}
    external_lib: null
    enable_gradient_checkpointing: true
    enable_activation_offload: false
    use_remove_padding: false
    trust_remote_code: false
    fsdp_config:
      param_offload: false
      optimizer_offload: false
      offload_policy: false
      reshard_after_forward: true
      wrap_policy:
        min_num_params: 0
      fsdp_size: -1
    lora_rank: 0
    lora_alpha: 16
    target_modules: all-linear
    save_hf_repo_id: null
  ppo_mini_batch_size: 128
  ppo_micro_batch_size: null
  ppo_micro_batch_size_per_gpu: null
  forward_micro_batch_size: null
  forward_micro_batch_size_per_gpu: null
  use_dynamic_bsz: false
  ppo_max_token_len_per_gpu: 32768
  forward_max_token_len_per_gpu: 32768
  ulysses_sequence_parallel_size: 1
  ppo_epochs: 1
  shuffle: false
  grad_clip: 1.0
  cliprange_value: 0.5
  loss_agg_mode: seq-mean-token-mean
  checkpoint:
    contents:
      - model
      - optimizer
      - extra

reward_model:
  enable: false
  strategy: fsdp
  model:
    input_tokenizer: /var/lib/condor/execute/slot1/dir_3823822/scratch/model_ckpt
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    use_shm: false
    external_lib: null
    use_remove_padding: false
    use_fused_kernels: false
    trust_remote_code: false
    fsdp_config:
      wrap_policy:
        min_num_params: 0
      param_offload: false
      reshard_after_forward: true
      fsdp_size: -1
  micro_batch_size: null
  micro_batch_size_per_gpu: null
  max_length: null
  ulysses_sequence_parallel_size: 1
  use_dynamic_bsz: false
  forward_max_token_len_per_gpu: 32768
  reward_manager: naive
  launch_reward_fn_async: false
  sandbox_fusion:
    url: null
    max_concurrent: 64

custom_reward_function:
  path: null
  name: compute_score

algorithm:
  gamma: 1.0
  lam: 1.0
  adv_estimator: grpo
  norm_adv_by_std_in_grpo: true
  use_kl_in_reward: false
  kl_penalty: kl
  kl_ctrl:
    type: fixed
    kl_coef: 0.001
    horizon: 10000
    target_kl: 0.1
  use_pf_ppo: false
  pf_ppo:
    reweight_method: pow
    weight_pow: 2.0
  mask_truncated_samples: false
  clip_advantages: false

trainer:
  balance_batch: true
  total_epochs: 1000000000
  total_training_steps: 55
  project_name: rllm-agent
  experiment_name: agentic-sudoku-Markov-qwen2.5-3B_6-6_SFT-24k-30_prm0
  logger:
    - console
    - wandb
  log_val_generations: 0
  rollout_data_dir: null
  validation_data_dir: null
  nnodes: 1
  n_gpus_per_node: 2
  save_freq: 50
  resume_mode: auto
  resume_from_path: null
  val_before_train: false
  test_freq: -1
  critic_warmup: 0
  default_hdfs_dir: null
  del_local_ckpt_after_load: false
  default_local_dir: checkpoints/rllm-agent/agentic-sudoku-Markov-qwen2.5-3B_6-6_SFT-24k-30_prm0
  max_actor_ckpt_to_keep: 1
  max_critic_ckpt_to_keep: null
  ray_wait_register_center_timeout: 300
  rejection_sample: false
  rejection_sample_multiplier: 2
  # NOTE(review): this value (4) is larger than n_gpus_per_node (2) above —
  # confirm the mismatch is intended.
  n_training_gpus_per_node: 4
  hf_token: null
  # NOTE(review): nested under trainer by inference; it could be a top-level
  # section instead. The trailing hf_token must belong to this mapping, since
  # trainer already defines its own hf_token.
  resume_from_hf:
    enable: false
    actor_hf_repo_id: null
    actor_revision: main
    critic_hf_repo_id: null
    critic_revision: main
    hf_token: null

ray_init:
  num_cpus: null

env:
  name: custom
  env_args: {}
  # NOTE(review): the three keys below follow an explicitly empty env_args in
  # the source, so they are placed at env level — confirm.
  NEG_REWARD: 0
  process_reward: 0
  env_type: action_validation_process_env

agent:
  name: miniwobagent
  max_steps: 15
  async_engine: true
  trajectory_timeout: null
  overlong_filter: false
  normalize_step_advantage: false
  use_stepwise_advantage: true
  stepwise_advantage_mode: mc_return
  agent_args: {}
  engine_args:
    disable_thinking: false
    # NOTE(review): gamma, state_estimation and mode are placed inside
    # engine_args by inference; they may instead be agent-level keys — confirm.
    gamma: 1
    state_estimation: false
    mode: markovian