---
# Model configuration whose root mapping is tagged `GroundingDINOConfig`.
#
# Every mapping carries a `_type` key naming a config class; presumably the
# loader uses it to pick the class to instantiate -- TODO confirm.
#
# NOTE(review): the block structure below was reconstructed from a flattened
# single-line copy of this file. Keys are alphabetically sorted inside each
# mapping, and that ordering is what fixes where each nested mapping ends.
# Confirm the nesting against the consumer's schema.
_type: GroundingDINOConfig

# Decoder stack: per-layer settings live under `layer_cfg`; the remaining
# keys apply to the stack as a whole.
decoder_cfg:
  _type: DecoderConfig
  layer_cfg:
    _type: DecoderLayerConfig
    # NOTE(review): integer 0 here, while encoder_cfg.layer_cfg uses float
    # 0.0 for the same-named keys -- confirm the loader accepts both.
    attention_dropout: 0
    cls_emb_type: mm_grounding_dino
    dropout: 0
    emb_dim: 256
    ffn_dim: 2048
    num_heads: 8
    num_levels: 4
    num_points: 4
    use_fusion: true
    use_legacy_pos: false
  num_layers: 6
  share_bbox_head: false
  share_cls_head: false
  share_norm: false
  share_pos_head: false

# Encoder stack: same layout as the decoder (per-layer `layer_cfg` plus
# `num_layers`), with additional `*_fusion` and `*_txt` sizes.
encoder_cfg:
  _type: CrossScaleEncoderConfig
  layer_cfg:
    _type: CrossScaleEncoderLayerConfig
    attention_dropout: 0.0
    dropout: 0.0
    droppath: 0.0
    emb_dim: 256
    emb_dim_fusion: 256
    ffn_dim: 2048
    ffn_dim_txt: 1024
    num_heads: 8
    num_heads_fusion: 4
    num_heads_txt: 4
    num_levels: 4
    num_points: 4
    use_fusion: true
    use_txt_self_att: false
  num_layers: 6

# Image backbone. `cfg_str` looks like a Hugging Face Hub model id resolved
# through the `transformers_auto` provider -- TODO confirm.
img_backbone_cfg:
  _type: ImageBackboneConfig
  backbone_kwargs:
    out_features:
      - stage2
      - stage3
      - stage4
  cfg_str: microsoft/swin-tiny-patch4-window7-224
  emb_dim: 256
  num_extra_down_projs: 1
  num_extra_up_projs: 0
  provider: transformers_auto
  use_legacy_pos: false
  use_pretrained: false

# Three-element image normalization constants; presumably per-channel RGB
# mean and standard deviation (ImageNet statistics) -- TODO confirm order.
img_mean:
  - 0.485
  - 0.456
  - 0.406
img_std:
  - 0.229
  - 0.224
  - 0.225

# Query selection. `cls_emb_type` and `emb_dim` repeat the values set in
# decoder_cfg.layer_cfg; keep them in sync if either is edited.
query_selector_cfg:
  _type: QuerySelectorConfig
  cls_emb_type: mm_grounding_dino
  emb_dim: 256
  num_queries: 900

# Text backbone. `backbone_kwargs` is an explicit empty mapping (`{}`), not
# null, so the consumer receives a dict with no entries.
txt_backbone_cfg:
  _type: TextBackboneConfig
  backbone_kwargs: {}
  cfg_str: bert-base-uncased
  emb_dim: 256
  provider: transformers_auto
  use_pooling: true
  use_pretrained: false