{
  "model_type": "mhubert_ipa_ctc_ft",
  "architectures": [
    "MHuBERTIPACTCFTModel"
  ],
  "auto_map": {
    "AutoConfig": "configuration_mhubert_ipa_ctc_ft.MHuBERTIPACTCFTConfig",
    "AutoModel": "modeling_mhubert_ipa_ctc_ft.MHuBERTIPACTCFTModel"
  },
  "base_model": "utter-project/mHuBERT-147",
  "backbone_id": "utter-project/mHuBERT-147",
  "task": "end-to-end CTC IPA phone recognition",
  "label_type": "IPA",
  "label_map": "ipa_map.json",
  "ctc_blank_token": "",
  "blank_position": "last",
  "architecture": {
    "input_dim": 768,
    "proj_dim": 256,
    "lstm_hidden": 256,
    "lstm_layers": 2,
    "lstm_bidirectional": true,
    "dropout": 0.3,
    "n_phones": 45,
    "output_dim": 46,
    "blank_id": 45
  },
  "evaluation": {
    "timit_test_per": 0.0896,
    "buckeye_val_per": 0.1987,
    "metric": "PER (Phone Error Rate)"
  },
  "backbone_config": {
    "activation_dropout": 0.1,
    "apply_spec_augment": true,
    "attention_dropout": 0.1,
    "classifier_proj_size": 256,
    "conv_bias": false,
    "conv_dim": [512, 512, 512, 512, 512, 512, 512],
    "conv_kernel": [10, 3, 3, 3, 3, 2, 2],
    "conv_stride": [5, 2, 2, 2, 2, 2, 2],
    "do_stable_layer_norm": false,
    "feat_extract_activation": "gelu",
    "feat_extract_dropout": 0.0,
    "feat_extract_norm": "group",
    "feat_proj_dropout": 0.1,
    "feat_proj_layer_norm": true,
    "final_dropout": 0.1,
    "gradient_checkpointing": false,
    "hidden_act": "gelu",
    "hidden_dropout": 0.1,
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "layer_norm_eps": 1e-05,
    "layerdrop": 0.1,
    "mask_feature_length": 10,
    "mask_feature_min_masks": 0,
    "mask_feature_prob": 0.0,
    "mask_time_length": 10,
    "mask_time_min_masks": 2,
    "mask_time_prob": 0.05,
    "model_type": "hubert",
    "num_attention_heads": 12,
    "num_conv_pos_embedding_groups": 16,
    "num_conv_pos_embeddings": 128,
    "num_feat_extract_layers": 7,
    "num_hidden_layers": 12,
    "torch_dtype": "float32",
    "use_weighted_layer_sum": false,
    "vocab_size": 32
  }
}