Instructions to use nyu-visionx/RAE-dinov2-wReg-base-ViTXL-n08 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Diffusers
How to use nyu-visionx/RAE-dinov2-wReg-base-ViTXL-n08 with Diffusers:
pip install -U diffusers transformers accelerate
import torch
from diffusers import DiffusionPipeline

# switch to "mps" for apple devices
pipe = DiffusionPipeline.from_pretrained("nyu-visionx/RAE-dinov2-wReg-base-ViTXL-n08", dtype=torch.bfloat16, device_map="cuda")

prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
image = pipe(prompt).images[0]
- Notebooks
- Google Colab
- Kaggle
Update config for diffusers AutoencoderRAE refactor
#3
by kashif HF Staff - opened
- config.json +15 -4
- conversion_metadata.json +0 -12
config.json
CHANGED
|
@@ -5,9 +5,7 @@
|
|
| 5 |
"decoder_intermediate_size": 4096,
|
| 6 |
"decoder_num_attention_heads": 16,
|
| 7 |
"decoder_num_hidden_layers": 28,
|
| 8 |
-
"encoder_cls": "dinov2",
|
| 9 |
"encoder_input_size": 224,
|
| 10 |
-
"encoder_name_or_path": "facebook/dinov2-with-registers-base",
|
| 11 |
"image_size": null,
|
| 12 |
"latents_mean": null,
|
| 13 |
"latents_std": [
|
|
@@ -222737,5 +222735,18 @@
|
|
| 222737 |
"patch_size": 16,
|
| 222738 |
"reshape_to_2d": true,
|
| 222739 |
"scaling_factor": 1.0,
|
| 222740 |
-
"use_encoder_loss": false
|
| 222741 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"decoder_intermediate_size": 4096,
|
| 6 |
"decoder_num_attention_heads": 16,
|
| 7 |
"decoder_num_hidden_layers": 28,
|
|
|
|
| 8 |
"encoder_input_size": 224,
|
|
|
|
| 9 |
"image_size": null,
|
| 10 |
"latents_mean": null,
|
| 11 |
"latents_std": [
|
|
|
|
| 222735 |
"patch_size": 16,
|
| 222736 |
"reshape_to_2d": true,
|
| 222737 |
"scaling_factor": 1.0,
|
| 222738 |
+
"use_encoder_loss": false,
|
| 222739 |
+
"encoder_type": "dinov2",
|
| 222740 |
+
"encoder_hidden_size": 768,
|
| 222741 |
+
"encoder_patch_size": 14,
|
| 222742 |
+
"encoder_norm_mean": [
|
| 222743 |
+
0.485,
|
| 222744 |
+
0.456,
|
| 222745 |
+
0.406
|
| 222746 |
+
],
|
| 222747 |
+
"encoder_norm_std": [
|
| 222748 |
+
0.229,
|
| 222749 |
+
0.224,
|
| 222750 |
+
0.225
|
| 222751 |
+
]
|
| 222752 |
+
}
|
conversion_metadata.json
DELETED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"source": "/data/boyang/models",
|
| 3 |
-
"encoder_cls": "dinov2",
|
| 4 |
-
"encoder_name_or_path": "facebook/dinov2-with-registers-base",
|
| 5 |
-
"decoder_checkpoint": "decoders/dinov2/wReg_base/ViTXL_n08/model.pt",
|
| 6 |
-
"stats_checkpoint": "stats/dinov2/wReg_base/imagenet1k/stat.pt",
|
| 7 |
-
"variant": "ViTXL_n08",
|
| 8 |
-
"dataset_name": "imagenet1k",
|
| 9 |
-
"decoder_config_name": "ViTXL",
|
| 10 |
-
"missing_decoder_keys": [],
|
| 11 |
-
"unexpected_decoder_keys": []
|
| 12 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|