---
# Configuration with a stage_1 model (stage1.RAE), a stage_2 model
# (LightningDiT), and transport / sampler / guidance / misc / eval / training
# sections.
#
# NOTE(review): this file was restored from a whitespace-collapsed copy. The
# nesting below (in particular `optimizer` and `scheduler` living under
# `training`) is reconstructed from the key names - confirm against the code
# that loads this config.
#
# All filesystem paths are absolute and point into one user's home directory
# (/home/jovyan/...); they must be adjusted when running elsewhere.

stage_1:
  target: stage1.RAE
  params:
    encoder_cls: Dinov2withNorm
    # Same snapshot directory as encoder_params.dinov2_path below.
    encoder_config_path: /home/jovyan/liushanyuan-sh-ceph/project/sub_project/gongyue/huggingface/hub/models--facebook--dinov2-with-registers-base/snapshots/a1d738ccfa7ae170945f210395d99dde8adb1805
    encoder_input_size: 224
    encoder_params:
      dinov2_path: /home/jovyan/liushanyuan-sh-ceph/project/sub_project/gongyue/huggingface/hub/models--facebook--dinov2-with-registers-base/snapshots/a1d738ccfa7ae170945f210395d99dde8adb1805
      normalize: true
    decoder_config_path: /home/jovyan/liushanyuan-sh-ceph/project/sub_project/gongyue/LightningDiT/RAE/configs/decoder/ViTXL
    pretrained_decoder_path: /home/jovyan/liushanyuan-sh-ceph/project/sub_project/gongyue/LightningDiT/RAE/models/decoders/dinov2/wReg_base/ViTXL_n08/model.pt
    noise_tau: 0
    reshape_to_2d: true
    normalization_stat_path: /home/jovyan/liushanyuan-sh-ceph/project/sub_project/gongyue/LightningDiT/RAE/models/stats/dinov2/wReg_base/imagenet1k/stat.pt

stage_2:
  target: stage2.models.lightningDiT.LightningDiT
  params:
    # input_size and in_channels match misc.latent_size (768 x 16 x 16).
    input_size: 16
    patch_size: 1
    in_channels: 768
    hidden_size: 1152
    depth: 28
    num_heads: 16
    mlp_ratio: 4.0
    class_dropout_prob: 0.1
    num_classes: 1000
    use_qknorm: true
    use_swiglu: true
    use_rope: true
    use_rmsnorm: true
    wo_shift: false
    use_gembed: true

transport:
  params:
    path_type: Linear
    prediction: velocity
    loss_weight: null
    time_dist_type: logit-normal_0_1

sampler:
  mode: ODE
  params:
    sampling_method: euler
    num_steps: 50
    atol: 1.0e-06
    rtol: 0.001
    reverse: false

guidance:
  method: cfg
  scale: 1.0
  t_min: 0.0
  t_max: 1.0

misc:
  latent_size:
    - 768
    - 16
    - 16
  num_classes: 1000
  # 196608 == 768 * 16 * 16, i.e. the product of latent_size.
  time_dist_shift_dim: 196608
  time_dist_shift_base: 4096

eval:
  eval_interval: 100000
  eval_model: true
  data_path: /home/jovyan/liushanyuan-sh-ceph/project/sub_project/gongyue/data/ILSVRC2012/ILSVRC2012_validation/data
  reference_npz_path: /home/jovyan/liushanyuan-sh-ceph/project/sub_project/gongyue/LightningDiT/guided-diffusion/evaluations/VIRTUAL_imagenet256_labeled.npz

training:
  epochs: 1400
  global_batch_size: 1024
  grad_accum_steps: 1
  ema_decay: 0.9995
  num_workers: 16
  log_interval: 100
  # NOTE(review): units of the interval settings (steps vs. epochs) are not
  # visible in this file - confirm with the training script.
  checkpoint_interval: 10
  sample_every: 10000
  clip_grad: 1.0
  global_seed: 42
  optimizer:
    # Same value as scheduler.base_lr.
    lr: 0.0002
    betas:
      - 0.9
      - 0.95
    weight_decay: 0.0
  scheduler:
    type: linear
    warmup_epochs: 40
    decay_end_epoch: 800
    base_lr: 0.0002
    final_lr: 2.0e-05
    warmup_from_zero: false