Upload fine-tuned Danish ASR (hviske v5.1) — coral_read_aloud WER 19.5%, coral_conversation WER 25.5%

d382915 verified about 2 months ago

4.1 kB

	{
	"architectures": [
	"CohereAsrForConditionalGeneration"
	],
	"auto_map": {
	"AutoConfig": "configuration_cohere_asr.CohereAsrConfig",
	"AutoFeatureExtractor": "processing_cohere_asr.CohereAsrFeatureExtractor",
	"AutoModel": "modeling_cohere_asr.CohereAsrModel",
	"AutoModelForSpeechSeq2Seq": "modeling_cohere_asr.CohereAsrForConditionalGeneration",
	"AutoProcessor": "processing_cohere_asr.CohereAsrProcessor",
	"AutoTokenizer": "tokenization_cohere_asr.CohereAsrTokenizer"
	},
	"batch_size": 128,
	"decoding": {
	"beam": {
	"beam_size": 1,
	"len_pen": 0.0,
	"max_generation_delta": 50
	},
	"return_best_hypothesis": true,
	"strategy": "beam"
	},
	"dtype": "bfloat16",
	"encoder": {
	"att_context_size": [
	-1,
	-1
	],
	"causal_downsampling": false,
	"conv_context_size": null,
	"conv_kernel_size": 9,
	"conv_norm_type": "batch_norm",
	"d_model": 1280,
	"dropout": 0,
	"dropout_att": 0,
	"dropout_emb": 0,
	"dropout_pre_encoder": 0,
	"feat_in": 128,
	"feat_out": -1,
	"ff_expansion_factor": 4,
	"n_heads": 8,
	"n_layers": 48,
	"pos_emb_max_len": 5000,
	"reduction": null,
	"reduction_factor": 1,
	"reduction_position": null,
	"self_attention_model": "rel_pos",
	"subsampling": "dw_striding",
	"subsampling_conv_channels": 256,
	"subsampling_factor": 8,
	"untie_biases": true,
	"xscaling": false
	},
	"head": {
	"activation": "relu",
	"dropout": 0,
	"hidden_size": 1024,
	"log_softmax": true,
	"num_classes": 16384,
	"num_layers": 1,
	"use_transformer_init": true
	},
	"is_encoder_decoder": true,
	"log_batch_stats": false,
	"log_prediction": true,
	"max_audio_clip_s": 35,
	"max_seq_len": 1024,
	"min_energy_window_samples": 1600,
	"model_defaults": {
	"asr_enc_hidden": 1280,
	"lm_dec_hidden": 1024,
	"lm_enc_hidden": 1024
	},
	"model_type": "cohere_asr",
	"multitask_metrics_cfg": {
	"log_predictions": true,
	"metrics": {
	"wer": {
	"constraint": ".source_lang==.target_lang"
	}
	}
	},
	"overlap_chunk_second": 5,
	"preprocessor": {
	"dither": 1e-05,
	"features": 128,
	"frame_splicing": 1,
	"log": true,
	"n_fft": 512,
	"normalize": "per_feature",
	"pad_to": 0,
	"pad_value": 0.0,
	"sample_rate": 16000,
	"window": "hann",
	"window_size": 0.025,
	"window_stride": 0.01
	},
	"prompt_defaults": [
	{
	"role": "user",
	"slots": {
	"decodercontext": "",
	"diarize": "<|nodiarize|>",
	"emotion": "<|emo:undefined|>",
	"itn": "<|noitn|>",
	"pnc": "<|pnc|>",
	"source_lang": "<|en|>",
	"target_lang": "<|en|>",
	"timestamp": "<|notimestamp|>"
	}
	},
	{
	"role": "user_partial",
	"slots": {
	"decodercontext": ""
	}
	}
	],
	"prompt_format": "cohere_asr",
	"sample_rate": 16000,
	"supported_languages": [
	"en",
	"fr",
	"de",
	"es",
	"it",
	"pt",
	"nl",
	"pl",
	"el",
	"ar",
	"ja",
	"zh",
	"vi",
	"ko",
	"da"
	],
	"transf_decoder": {
	"config_dict": {
	"attn_layer_dropout": 0,
	"attn_score_dropout": 0,
	"embedding_dropout": 0,
	"ffn_dropout": 0,
	"hidden_act": "relu",
	"hidden_size": 1024,
	"inner_size": 4096,
	"learn_positional_encodings": false,
	"lm_dec_hidden": 1280,
	"max_sequence_length": 1024,
	"num_attention_heads": 8,
	"num_layers": 8,
	"num_token_types": 0,
	"pre_ln": true,
	"vocab_size": "None"
	},
	"encoder": null,
	"model_name": null,
	"pre_ln_final_layer_norm": true,
	"pretrained": false
	},
	"transf_encoder": {
	"attn_layer_dropout": 0,
	"attn_score_dropout": 0,
	"ffn_dropout": 0,
	"hidden_size": 1024,
	"inner_size": 4096,
	"mask_future": false,
	"num_attention_heads": 8,
	"num_layers": 0,
	"pre_ln": true,
	"pre_ln_final_layer_norm": true
	},
	"transformers_version": "4.57.6",
	"use_loss_mask_for_prompt": false,
	"vocab_size": 16384
	}