# Model configuration: the framework name plus one sub-section per component
# (vision-language backbones, DINO backbone, action model).
framework:
  name: LlamaOFT
  llamavl:
    # Absolute local path to the base checkpoint (machine-specific).
    base_vlm: /datasets/llama3_2-11B-Vision
    attn_implementation: sdpa
    vl_hidden_dim: 4096
  # NOTE(review): this section is keyed `qwenvl` but its value names a Llama
  # checkpoint, and it differs from `llamavl.base_vlm` above (bare name with
  # "-Instruct" vs. a local path). Confirm whether this section is still read
  # when `name` is LlamaOFT or is a leftover.
  qwenvl:
    base_vlm: Llama-3.2-11B-Vision-Instruct
  dino:
    dino_backbone: dinov2_vits14
  action_model:
    action_model_type: DiT-B
    # NOTE(review): `action_hidden_dim`, `hidden_size`, `cross_attention_dim`
    # (below) and `llamavl.vl_hidden_dim` (above) are all 4096 -- presumably
    # they must be changed together; verify against the model code.
    action_hidden_dim: 4096
    hidden_size: 4096
    add_pos_embed: true
    max_seq_len: 1024
    action_dim: 7
    state_dim: 7
    # NOTE(review): action_horizon (8) == future_action_window_size (7) + 1
    # with past_action_window_size 0 -- presumably an invariant; confirm.
    future_action_window_size: 7
    action_horizon: 8
    past_action_window_size: 0
    repeated_diffusion_steps: 8
    # Noise-schedule parameters; presumably alpha/beta of a Beta distribution
    # and a scale factor -- TODO confirm against the action-model code.
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_timestep_buckets: 1000
    num_inference_timesteps: 4
    num_target_vision_tokens: 32
    # Nested settings for the diffusion model inside the action model.
    diffusion_model_cfg:
      cross_attention_dim: 4096
      dropout: 0.2
      final_dropout: true
      interleave_self_attention: true
      norm_type: ada_norm
      num_layers: 16
      output_dim: 2560
      # Explicit null (not an empty string or 0).
      positional_embeddings: null
  reduce_in_full_precision: true
# Dataset settings; everything lives under the single `vla_data` entry.
datasets:
  vla_data:
    dataset_py: lerobot_datasets
    dataloader_module: lerobot_datasets
    # Absolute local path to the dataset root (machine-specific).
    data_root_dir: /share/weiyu/IPEC-COMMUNITY
    # NOTE(review): `data_mix` here is libero_goal while `dataset_mix` at the
    # end of this mapping is libero_all -- confirm which key the loader reads
    # and whether the two are meant to differ.
    data_mix: libero_goal
    action_type: delta_qpos
    sequential_step_sampling: false
    # Prompt template containing the literal placeholder `{instruction}`
    # (presumably substituted by the data pipeline -- confirm). Folded block
    # scalar with strip chomping: the two lines are joined by a single space
    # and no trailing newline is added.
    CoT_prompt: >-
      Your task is {instruction}. To identify the key objects for your task.
      Locate their bounding boxes in [x1,y1,x2,y2] format.
    CoT_answer: bbox
    # Three integers; presumably (channels, height, width) -- TODO confirm.
    default_image_resolution: [3, 224, 224]
    per_device_batch_size: 4
    load_all_data_for_training: true
    # Single-element list of observation keys.
    obs: [image_0]
    video_backend: pyav
    include_state: true
    dataset_mix: libero_all
# Training-loop settings: precision, schedule, per-module learning rates,
# optimizer, checkpointing and resume controls.
trainer:
  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true
  # NOTE(review): both `epochs` and `max_train_steps` (below) are set --
  # confirm which one terminates training.
  epochs: 100
  # NOTE(review): 200001 is larger than `save_interval` (80000) but smaller
  # than `max_train_steps` (1200000) -- confirm the intended eval cadence.
  eval_interval: 200001
  # Dotted module path; presumably matched against model attribute names to
  # freeze parameters -- verify against the trainer.
  freeze_modules: llama_vl_interface.model.language_model
  gradient_accumulation_steps: 8
  # NOTE(review): `gradient_clipping` and `max_grad_norm` (below) are both 1.0
  # -- confirm which key is actually consumed and keep them in sync.
  gradient_clipping: 1.0
  is_resume: false
  # Learning rates keyed by name; presumably `base` is the fallback for
  # modules not listed explicitly -- TODO confirm.
  learning_rate:
    action_model: 0.0008
    base: 0.00024
    # NOTE(review): a `qwen_vl_interface` rate is listed although
    # `freeze_modules` above refers to `llama_vl_interface` -- possibly unused
    # in this run.
    qwen_vl_interface: 1.0e-05
    llama_vl_interface: 8.0e-05
  logging_frequency: 10
  # Per-loss weights.
  loss_scale:
    vla: 1.0
    vlm: 0.1
  lr_scheduler_type: cosine_with_min_lr
  max_grad_norm: 1.0
  max_train_steps: 1200000
  # NOTE(review): `num_warmup_steps` (5000) and `warmup_ratio` (0.1, below)
  # disagree if the ratio is applied to `max_train_steps`
  # (0.1 * 1200000 = 120000) -- confirm which one the scheduler uses.
  num_warmup_steps: 5000
  optimizer:
    betas:
    - 0.9
    - 0.95
    eps: 1.0e-08
    name: AdamW
    # NOTE(review): differs from the trainer-level `weight_decay: 0.0` at the
    # end of this section -- confirm which value reaches the optimizer.
    weight_decay: 1.0e-08
  # Explicit nulls; `is_resume` above is false.
  resume_epoch: null
  resume_step: null
  save_interval: 80000
  # Extra keyword arguments for the scheduler selected by `lr_scheduler_type`.
  scheduler_specific_kwargs:
    min_lr: 1.0e-06
  warmup_ratio: 0.1
  weight_decay: 0.0
# Runtime environment: experiment tracking (wandb), process/GPU counts and
# NCCL-related settings.
environment:
  wandb_mode: online
  wandb_project: vla-engine-benchmark
  # Empty string, not null; presumably falls back to the logged-in user's
  # default entity -- confirm.
  wandb_entity: ''
  # NOTE(review): non-default wandb API host -- confirm this endpoint is
  # intended and trusted before logging runs to it.
  wandb_base_url: https://api.bandw.top
  num_gpus: 2
  main_process_port: 29500
  # NOTE(review): key names resemble the NCCL_IB_HCA / NCCL_BLOCKING_WAIT /
  # NCCL_ASYNC_ERROR_HANDLING environment variables -- presumably exported by
  # the launcher; confirm.
  nccl:
    # Comma-separated device list kept as a single string.
    ib_hca: mlx5_2,mlx5_3
    blocking_wait: 1
    async_error_handling: 1
    # NOTE(review): units are not stated for `timeout`; `socket_timeout_ms`
    # suggests milliseconds for that key only -- verify.
    timeout: 10000
    socket_timeout_ms: 360000
# Top-level run identity and output locations.
seed: 42
# NOTE(review): the id says "all" while `datasets.vla_data.data_mix` is
# libero_goal (and `dataset_mix` is libero_all) -- confirm the run name
# matches the data actually used.
run_id: llama_oft_all_150k
output_root_dir: ./results/training
# Equals `output_root_dir` joined with `run_id`; the value is written out
# literally here, so update it by hand if either of those changes.
output_dir: ./results/training/llama_oft_all_150k