{ "model_type": "sortformer", "num_speakers": 4, "ats_weight": 0.5, "pil_weight": 0.5, "dtype": "float16", "fc_encoder_config": { "model_type": "sortformer_fc_encoder", "hidden_size": 512, "num_hidden_layers": 17, "num_attention_heads": 8, "num_key_value_heads": 8, "intermediate_size": 2048, "hidden_act": "silu", "num_mel_bins": 128, "conv_kernel_size": 9, "subsampling_factor": 8, "subsampling_conv_channels": 256, "subsampling_conv_kernel_size": 3, "subsampling_conv_stride": 2, "max_position_embeddings": 5000, "attention_bias": true, "scale_input": true }, "tf_encoder_config": { "model_type": "sortformer_tf_encoder", "d_model": 192, "encoder_layers": 18, "encoder_attention_heads": 8, "encoder_ffn_dim": 768, "activation_function": "relu", "max_source_positions": 1500, "k_proj_bias": true }, "modules_config": { "model_type": "sortformer_modules", "num_speakers": 4, "fc_d_model": 512, "tf_d_model": 192, "subsampling_factor": 8, "chunk_len": 188, "fifo_len": 0, "spkcache_len": 188, "spkcache_update_period": 188, "chunk_left_context": 1, "chunk_right_context": 1, "spkcache_sil_frames_per_spk": 3, "causal_attn_rc": 7, "scores_boost_latest": 0.05, "sil_threshold": 0.2, "pred_score_threshold": 0.25, "strong_boost_rate": 0.75, "weak_boost_rate": 1.5, "min_pos_scores_rate": 0.5, "max_index": 99999, "use_aosc": true }, "processor_config": { "feature_size": 128, "sampling_rate": 16000, "hop_length": 160, "n_fft": 512, "win_length": 400, "preemphasis": 0.97 } }