# Page metadata captured alongside this file (not part of the configuration):
# file size 4,791 bytes; revision markers 8412ad5 and fecc0f2.
# No config path is set here (explicit null).
# NOTE(review): presumably an optional path to another config to load —
# confirm against the consuming loader.
load_config_path: null
# Model settings for model_type Gr00tN1d6.
# NOTE(review): the notes below only record relationships visible in this
# file; the meaning of each key is defined by the consuming loader — confirm
# there before relying on them.
model:
  model_type: Gr00tN1d6
  model_dtype: bfloat16
  model_name: nvidia/Eagle-Block2A-2B-v2
  backbone_model_type: eagle
  model_revision: null
  # NOTE(review): tune_top_llm_layers is 4 while tune_llm (below) is false —
  # confirm how the two settings interact.
  tune_top_llm_layers: 4
  backbone_embedding_dim: 2048
  tune_llm: false
  tune_visual: false
  select_layer: 16
  reproject_vision: false
  use_flash_attention: true
  # NOTE(review): load_bf16 is false although model_dtype is bfloat16 and
  # training.bf16 is true — confirm this combination is intended.
  load_bf16: false
  collator_overwrite_image_inputs: false
  eagle_collator: true
  backbone_trainable_params_fp32: true
  # Both image size keys are null here, while data.image_crop_size and
  # data.image_target_size are set ([244, 244] and [224, 224]).
  # NOTE(review): confirm which section takes precedence.
  image_crop_size: null
  image_target_size: null
  shortest_image_edge: 256
  crop_fraction: 0.95
  random_rotation_angle: null
  color_jitter_params: null
  use_albumentations_transforms: true
  formalize_language: true
  apply_sincos_state_encoding: false
  use_relative_action: true
  max_state_dim: 29
  max_action_dim: 29
  # Same count as the 16 action delta_indices (0-15) listed under
  # data.modality_configs.reachy2.action.
  action_horizon: 16
  # Same value as diffusion_model_cfg.output_dim (1024).
  hidden_size: 1024
  # Equals num_attention_heads * attention_head_dim below (32 * 48 = 1536).
  # NOTE(review): presumably these must stay in sync — confirm.
  input_embedding_dim: 1536
  add_pos_embed: true
  attn_dropout: 0.2
  use_vlln: true
  max_seq_len: 1024
  use_alternate_vl_dit: true
  attend_text_every_n_blocks: 2
  # Nested settings for the diffusion model.
  diffusion_model_cfg:
    positional_embeddings: null
    num_layers: 32
    num_attention_heads: 32
    attention_head_dim: 48
    norm_type: ada_norm
    # Same value as attn_dropout above (0.2).
    dropout: 0.2
    final_dropout: true
    output_dim: 1024
    interleave_self_attention: true
  num_inference_timesteps: 4
  noise_beta_alpha: 1.5
  noise_beta_beta: 1.0
  noise_s: 0.999
  num_timestep_buckets: 1000
  # The projector, diffusion model and vlln are all marked tunable, whereas
  # tune_llm and tune_visual above are false.
  tune_projector: true
  tune_diffusion_model: true
  tune_vlln: true
  state_dropout_prob: 0.0
  state_additive_noise_scale: 0.0
  max_num_embodiments: 32
# Dataset selection, per-modality layout and sampling options.
# Short leaf lists are written in flow style; lists of mappings stay in block
# style. The parsed data is unchanged.
# NOTE(review): key semantics come from the consuming loader — confirm there.
data:
  # One dataset entry, tagged with the reachy2 embodiment.
  datasets:
  - dataset_paths: ['./datasets/reachy2_100']
    embodiment_tag: reachy2
    mix_ratio: 1.0
    dataset_type: physical_embodiment
    val_dataset_path: null
  # Modality layout keyed by embodiment tag (matches embodiment_tag above).
  modality_configs:
    reachy2:
      video:
        delta_indices: [0]
        modality_keys: [front_cam]
        sin_cos_embedding_keys: null
        mean_std_embedding_keys: null
        action_configs: null
      state:
        delta_indices: [0]
        modality_keys: [arm_joints]
        sin_cos_embedding_keys: null
        mean_std_embedding_keys: null
        action_configs: null
      action:
        # Sixteen consecutive indices (0-15); same count as
        # model.action_horizon (16).
        delta_indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
        modality_keys: [arm_joints, gripper]
        sin_cos_embedding_keys: null
        mean_std_embedding_keys: null
        # Two entries, listed in the same order as the two modality_keys.
        # NOTE(review): presumably paired positionally (arm_joints, then
        # gripper) — confirm.
        action_configs:
        - rep: RELATIVE
          type: NON_EEF
          format: DEFAULT
          state_key: arm_joints
        - rep: ABSOLUTE
          type: NON_EEF
          format: DEFAULT
          state_key: null
      language:
        delta_indices: [0]
        modality_keys: [annotation.human.task_description]
        sin_cos_embedding_keys: null
        mean_std_embedding_keys: null
        action_configs: null
  download_cache: false
  shard_size: 1024
  episode_sampling_rate: 0.1
  num_shards_per_epoch: 10000
  override_pretraining_statistics: false
  mode: single_turn
  random_chop: 0.0
  mock_dataset_mode: false
  shuffle: true
  seed: 42
  multiprocessing_context: fork
  allow_padding: false
  subsample_ratio: 1.0
  # These two are set here while model.image_crop_size and
  # model.image_target_size are null.
  # NOTE(review): crop is 244 and target is 224 — confirm the 244 is intended.
  image_crop_size: [244, 244]
  image_target_size: [224, 224]
  video_backend: decord
# Training run settings: output location, optimisation, precision,
# checkpointing, evaluation and logging.
# NOTE(review): key semantics come from the consuming loader — confirm there.
training:
  # NOTE(review): output goes under /tmp — confirm checkpoints are meant to
  # live in a location that may not persist.
  output_dir: /tmp/groot_output/reachy2
  experiment_name: null
  # max_steps and save_steps (below) are repeated as top-level keys at the end
  # of the file with the same values (30000 and 3000).
  max_steps: 30000
  # batch_size is null while global_batch_size is 64 and num_gpus is 1.
  # NOTE(review): presumably the per-device size is derived — confirm.
  global_batch_size: 64
  batch_size: null
  gradient_accumulation_steps: 1
  learning_rate: 0.0001
  lr_scheduler_type: cosine
  weight_decay: 1.0e-05
  # NOTE(review): warmup_ratio is 0.05 while warmup_steps is 0 — confirm which
  # of the two the trainer honours.
  warmup_ratio: 0.05
  warmup_steps: 0
  max_grad_norm: 1.0
  optim: adamw_torch
  start_from_checkpoint: nvidia/GR00T-N1.6-3B
  # Precision flags: bf16 and tf32 on, fp16 off.
  tf32: true
  fp16: false
  bf16: true
  eval_bf16: true
  logging_steps: 10
  # With max_steps 30000 this is one save every tenth of the run;
  # save_total_limit is 5.
  save_steps: 3000
  save_total_limit: 5
  save_vl_model: false
  # upload_checkpoints is false.
  # NOTE(review): the other upload_* keys are presumably inert — confirm.
  upload_checkpoints: false
  upload_every: 1000
  upload_last_n_checkpoints: 5
  max_concurrent_uploads: 2
  # Quoted so the value stays the string 'no' rather than being read as a
  # boolean by YAML 1.1 parsers.
  # NOTE(review): the eval_* keys below are presumably unused while this is
  # 'no' — confirm.
  eval_strategy: 'no'
  eval_steps: 500
  eval_set_split_ratio: 0.1
  eval_batch_size: 2
  save_best_eval_metric_name: ''
  save_best_eval_metric_greater_is_better: true
  # NOTE(review): deepspeed_stage is 2 with use_ddp false and num_gpus 1 —
  # confirm this combination is intended.
  deepspeed_stage: 2
  gradient_checkpointing: false
  # NOTE(review): trust_remote_code is enabled — confirm that loading code
  # from the referenced model repositories is intended.
  transformers_trust_remote_code: true
  transformers_local_files_only: false
  transformers_cache_dir: null
  transformers_access_token: null
  use_ddp: false
  ddp_bucket_cap_mb: 100
  num_gpus: 1
  dataloader_num_workers: 8
  remove_unused_columns: false
  # use_wandb is false; wandb_project is still set.
  use_wandb: false
  wandb_project: finetune-gr00t-n1d6
  enable_profiling: false
  max_retries: 3
  assert_loss_less_than: null
  add_rl_callback: false
  # Open-loop evaluation is disabled; the remaining open_loop_eval_* keys are
  # still populated.
  enable_open_loop_eval: false
  open_loop_eval_traj_ids:
  - 0
  open_loop_eval_steps_per_traj: 100
  open_loop_eval_plot_indices: null
# Top-level keys carrying the same values as training.max_steps (30000) and
# training.save_steps (3000).
# NOTE(review): confirm whether the loader reads these or the nested copies,
# and keep the two places in sync when changing either.
max_steps: 30000
save_steps: 3000