| _type: GroundingDINOConfig | |
| decoder_cfg: | |
| _type: DecoderConfig | |
| layer_cfg: | |
| _type: DecoderLayerConfig | |
| attention_dropout: 0 | |
| cls_emb_type: mm_grounding_dino | |
| dropout: 0 | |
| emb_dim: 256 | |
| ffn_dim: 2048 | |
| num_heads: 8 | |
| num_levels: 4 | |
| num_points: 4 | |
| use_fusion: true | |
| use_legacy_pos: false | |
| num_layers: 6 | |
| share_bbox_head: false | |
| share_cls_head: false | |
| share_norm: false | |
| share_pos_head: false | |
| encoder_cfg: | |
| _type: CrossScaleEncoderConfig | |
| layer_cfg: | |
| _type: CrossScaleEncoderLayerConfig | |
| attention_dropout: 0.0 | |
| dropout: 0.0 | |
| droppath: 0.0 | |
| emb_dim: 256 | |
| emb_dim_fusion: 256 | |
| ffn_dim: 2048 | |
| ffn_dim_txt: 1024 | |
| num_heads: 8 | |
| num_heads_fusion: 4 | |
| num_heads_txt: 4 | |
| num_levels: 4 | |
| num_points: 4 | |
| use_fusion: true | |
| use_txt_self_att: false | |
| num_layers: 6 | |
| img_backbone_cfg: | |
| _type: ImageBackboneConfig | |
| backbone_kwargs: | |
| out_features: | |
| - stage2 | |
| - stage3 | |
| - stage4 | |
| cfg_str: microsoft/swin-tiny-patch4-window7-224 | |
| emb_dim: 256 | |
| num_extra_down_projs: 1 | |
| num_extra_up_projs: 0 | |
| provider: transformers_auto | |
| use_legacy_pos: false | |
| use_pretrained: false | |
| img_mean: | |
| - 0.485 | |
| - 0.456 | |
| - 0.406 | |
| img_std: | |
| - 0.229 | |
| - 0.224 | |
| - 0.225 | |
| query_selector_cfg: | |
| _type: QuerySelectorConfig | |
| cls_emb_type: mm_grounding_dino | |
| emb_dim: 256 | |
| num_queries: 900 | |
| txt_backbone_cfg: | |
| _type: TextBackboneConfig | |
| backbone_kwargs: {} | |
| cfg_str: bert-base-uncased | |
| emb_dim: 256 | |
| provider: transformers_auto | |
| use_pooling: true | |
| use_pretrained: false | |