{ "architectures": [ "MagentaRT2ForConditionalGeneration" ], "auto_map": { "AutoConfig": "configuration_magenta_rt2.MagentaRT2Config", "AutoModel": "modeling_magenta_rt2.MagentaRT2ForConditionalGeneration" }, "cfg_drums": 1.0, "cfg_musiccoca": 3.0, "cfg_notes": 1.0, "codebook_size": 1024, "codec_param_shapes": { "decoder_0__conv2d_3x3__conv__bias": [ 1024 ], "decoder_0__conv2d_3x3__conv__kernel": [ 3, 3, 1024, 1024 ], "decoder_0__conv2dtranspose_4x3__conv__bias": [ 1024 ], "decoder_0__conv2dtranspose_4x3__conv__kernel": [ 4, 3, 512, 1024 ], "decoder_0__shortcut_layer__conv1x1__conv__bias": [ 1024 ], "decoder_0__shortcut_layer__conv1x1__conv__kernel": [ 1, 1, 512, 1024 ], "decoder_1__conv2d_3x3__conv__bias": [ 256 ], "decoder_1__conv2d_3x3__conv__kernel": [ 3, 3, 256, 256 ], "decoder_1__conv2dtranspose_4x4__conv__bias": [ 256 ], "decoder_1__conv2dtranspose_4x4__conv__kernel": [ 4, 4, 512, 256 ], "decoder_1__shortcut_layer__conv1x1__conv__bias": [ 256 ], "decoder_1__shortcut_layer__conv1x1__conv__kernel": [ 1, 1, 512, 256 ], "decoder_2__conv2d_3x3__conv__bias": [ 256 ], "decoder_2__conv2d_3x3__conv__kernel": [ 3, 3, 256, 256 ], "decoder_2__conv2dtranspose_3x4__conv__bias": [ 256 ], "decoder_2__conv2dtranspose_3x4__conv__kernel": [ 3, 4, 256, 256 ], "decoder_3__conv2d_3x3__conv__bias": [ 256 ], "decoder_3__conv2d_3x3__conv__kernel": [ 3, 3, 256, 256 ], "decoder_3__conv2dtranspose_3x4__conv__bias": [ 256 ], "decoder_3__conv2dtranspose_3x4__conv__kernel": [ 3, 4, 256, 256 ], "decoder_4__conv2d_3x3__conv__bias": [ 128 ], "decoder_4__conv2d_3x3__conv__kernel": [ 3, 3, 128, 128 ], "decoder_4__conv2dtranspose_3x6__conv__bias": [ 128 ], "decoder_4__conv2dtranspose_3x6__conv__kernel": [ 3, 6, 256, 128 ], "decoder_4__shortcut_layer__conv1x1__conv__bias": [ 128 ], "decoder_4__shortcut_layer__conv1x1__conv__kernel": [ 1, 1, 256, 128 ], "decoder_5__conv2d_3x3__conv__bias": [ 128 ], "decoder_5__conv2d_3x3__conv__kernel": [ 3, 3, 128, 128 ], "decoder_5__conv2dtranspose_3x4__conv__bias": [ 128 ], "decoder_5__conv2dtranspose_3x4__conv__kernel": [ 3, 4, 128, 128 ], "decoder_6__conv2d_3x3__conv__bias": [ 64 ], "decoder_6__conv2d_3x3__conv__kernel": [ 3, 3, 64, 64 ], "decoder_6__conv2dtranspose_3x4__conv__bias": [ 64 ], "decoder_6__conv2dtranspose_3x4__conv__kernel": [ 3, 4, 128, 64 ], "decoder_6__shortcut_layer__conv1x1__conv__bias": [ 64 ], "decoder_6__shortcut_layer__conv1x1__conv__kernel": [ 1, 1, 128, 64 ], "input_layer__conv1x1_first__conv__bias": [ 2560 ], "input_layer__conv1x1_first__conv__kernel": [ 1, 1, 256, 2560 ], "input_layer__shortcut_layer__conv1x1_b1__conv__bias": [ 2560 ], "input_layer__shortcut_layer__conv1x1_b1__conv__kernel": [ 1, 1, 256, 2560 ], "input_layer__shortcut_layer__conv1x1_b2__conv__bias": [ 2560 ], "input_layer__shortcut_layer__conv1x1_b2__conv__kernel": [ 1, 1, 2560, 2560 ], "input_layers_residual_unit__conv2d_3x3__conv__bias": [ 512 ], "input_layers_residual_unit__conv2d_3x3__conv__kernel": [ 3, 3, 512, 512 ], "input_layers_residual_unit__conv2d_3x3_a__conv__bias": [ 512 ], "input_layers_residual_unit__conv2d_3x3_a__conv__kernel": [ 3, 3, 512, 512 ], "output_layer__base_conv_last__conv__bias": [ 2 ], "output_layer__base_conv_last__conv__kernel": [ 7, 7, 64, 2 ] }, "depth": [ 2, 768, 3072, 6, 128 ], "depth_max_past": 12, "dtype": "float32", "encoder_model_dims": 256, "frame_samples": 1920, "model_type": "magenta_rt2", "musiccoca_embed_dim": 768, "musiccoca_per_rvq_vocab": 1031, "musiccoca_rvq": 12, "num_codebooks": 12, "num_drums": 1, "num_notes": 128, "num_reserved_tokens": 6, "num_sinks": 1, "regular_num_channels": 132, "regular_num_embeddings_per_channel": [ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 9, 47, 47, 15 ], "sample_rate": 48000, "size": "mrt2_small", "soft_cap_logits": 30.0, "temperature": 1.3, "temporal": [ 12, 1024, 4096, 8, 128 ], "temporal_max_past": 41, "top_k": 40, "transformers_version": "5.8.0", "vocab_size": 12294 }