---
# Training configuration for the run identified by `run_id` below.
#
# NOTE(review): the block nesting in this file was reconstructed from a copy in
# which all indentation had been collapsed. Placements that could not be
# established from the key ordering alone carry their own NOTE(review) comment;
# confirm them against the code that reads this file.

framework:
  name: LlamaOFT
  llamavl:
    base_vlm: /datasets/llama3_2-11B-Vision
    attn_implementation: sdpa
    vl_hidden_dim: 4096
  # NOTE(review): this section names a Llama checkpoint under a `qwenvl` key;
  # confirm whether it is still read or is a leftover.
  qwenvl:
    base_vlm: Llama-3.2-11B-Vision-Instruct
  dino:
    dino_backbone: dinov2_vits14
  action_model:
    action_model_type: DiT-B
    action_hidden_dim: 4096
    hidden_size: 4096
    add_pos_embed: true
    max_seq_len: 1024
    action_dim: 7
    state_dim: 7
    future_action_window_size: 7
    action_horizon: 8
    past_action_window_size: 0
    repeated_diffusion_steps: 8
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_timestep_buckets: 1000
    num_inference_timesteps: 4
    num_target_vision_tokens: 32
    diffusion_model_cfg:
      cross_attention_dim: 4096
      dropout: 0.2
      final_dropout: true
      interleave_self_attention: true
      norm_type: ada_norm
      num_layers: 16
      output_dim: 2560
      positional_embeddings: null
  # NOTE(review): placed at the `framework` level; it could also belong to
  # `action_model` or `diffusion_model_cfg` -- confirm.
  reduce_in_full_precision: true

datasets:
  vla_data:
    dataset_py: lerobot_datasets
    dataloader_module: lerobot_datasets
    data_root_dir: /share/weiyu/IPEC-COMMUNITY
    data_mix: libero_goal
    action_type: delta_qpos
    sequential_step_sampling: false
    # Folded scalar: the two lines below are joined with a single space and
    # carry no trailing newline. `{instruction}` is kept verbatim.
    CoT_prompt: >-
      Your task is {instruction}. To identify the key objects for your task.
      Locate their bounding boxes in [x1,y1,x2,y2] format.
    CoT_answer: bbox
    default_image_resolution:
      - 3
      - 224
      - 224
    per_device_batch_size: 4
    load_all_data_for_training: true
    obs:
      - image_0
    video_backend: pyav
    include_state: true
    # NOTE(review): `dataset_mix` (libero_all) sits next to `data_mix`
    # (libero_goal) above, and the run id below says "all". Confirm which key
    # the data loader reads and whether the two values are meant to differ.
    dataset_mix: libero_all

trainer:
  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true
  epochs: 100
  eval_interval: 200001
  freeze_modules: llama_vl_interface.model.language_model
  gradient_accumulation_steps: 8
  # NOTE(review): `gradient_clipping` and `max_grad_norm` (further down) are
  # both 1.0; confirm which one the trainer consumes.
  gradient_clipping: 1.0
  is_resume: false
  learning_rate:
    action_model: 0.0008
    base: 0.00024
    qwen_vl_interface: 1.0e-05
    llama_vl_interface: 8.0e-05
  logging_frequency: 10
  loss_scale:
    vla: 1.0
    vlm: 0.1
  lr_scheduler_type: cosine_with_min_lr
  max_grad_norm: 1.0
  max_train_steps: 1200000
  num_warmup_steps: 5000
  optimizer:
    betas:
      - 0.9
      - 0.95
    eps: 1.0e-08
    name: AdamW
    weight_decay: 1.0e-08
  resume_epoch: null
  resume_step: null
  save_interval: 80000
  scheduler_specific_kwargs:
    min_lr: 1.0e-06
  warmup_ratio: 0.1
  # Distinct from `optimizer.weight_decay` above (0.0 here, 1.0e-08 there).
  weight_decay: 0.0

environment:
  wandb_mode: online
  wandb_project: vla-engine-benchmark
  wandb_entity: ''
  wandb_base_url: 'https://api.bandw.top'
  num_gpus: 2
  main_process_port: 29500
  # NOTE(review): the five keys below are assumed to all belong to `nccl`;
  # confirm that `timeout` and `socket_timeout_ms` are not `environment`-level.
  nccl:
    ib_hca: 'mlx5_2,mlx5_3'
    blocking_wait: 1
    async_error_handling: 1
    timeout: 10000
    socket_timeout_ms: 360000

# NOTE(review): the keys below are assumed to be top-level rather than children
# of `environment` -- confirm.
seed: 42
run_id: llama_oft_all_150k
output_root_dir: ./results/training
output_dir: ./results/training/llama_oft_all_150k