{ "model_type": "omnilingual_asr_ctc", "model_size": "3B", "variant": "ctc", "sample_rate": 16000, "frame_rate": 50, "max_audio_seconds": 40, "feature_extractor": { "kind": "wav2vec2_conv", "feature_dim": 512, "layers": [ { "channels": 512, "kernel": 10, "stride": 5 }, { "channels": 512, "kernel": 3, "stride": 2 }, { "channels": 512, "kernel": 3, "stride": 2 }, { "channels": 512, "kernel": 3, "stride": 2 }, { "channels": 512, "kernel": 3, "stride": 2 }, { "channels": 512, "kernel": 2, "stride": 2 }, { "channels": 512, "kernel": 2, "stride": 2 } ], "normalize_audio": true }, "pos_encoder": { "kind": "wav2vec2_conv", "kernel_size": 128, "num_groups": 16, "weight_norm": true }, "encoder": { "kind": "standard_transformer", "norm_order": "pre", "num_layers": 60, "model_dim": 2048, "num_heads": 32, "ffn_dim": 8192 }, "ctc_head": { "vocab_size": 10288 }, "tokenizer": { "kind": "sentencepiece", "file": "tokenizer.model", "bos_idx": 0, "pad_idx": 1, "eos_idx": 2, "unk_idx": 3 }, "quantization": { "bits": 4, "group_size": 64, "method": "minmax_per_group", "targets": [ "self_attn.q_proj.weight", "self_attn.k_proj.weight", "self_attn.v_proj.weight", "self_attn.output_proj.weight", "ffn.inner_proj.weight", "ffn.output_proj.weight", "final_proj.weight" ] } }