{
  "model_type": "omnilingual_asr_ctc",
  "model_size": "3B",
  "variant": "ctc",
  "sample_rate": 16000,
  "frame_rate": 50,
  "max_audio_seconds": 40,
  "feature_extractor": {
    "kind": "wav2vec2_conv",
    "feature_dim": 512,
    "layers": [
      {
        "channels": 512,
        "kernel": 10,
        "stride": 5
      },
      {
        "channels": 512,
        "kernel": 3,
        "stride": 2
      },
      {
        "channels": 512,
        "kernel": 3,
        "stride": 2
      },
      {
        "channels": 512,
        "kernel": 3,
        "stride": 2
      },
      {
        "channels": 512,
        "kernel": 3,
        "stride": 2
      },
      {
        "channels": 512,
        "kernel": 2,
        "stride": 2
      },
      {
        "channels": 512,
        "kernel": 2,
        "stride": 2
      }
    ],
    "normalize_audio": true
  },
  "pos_encoder": {
    "kind": "wav2vec2_conv",
    "kernel_size": 128,
    "num_groups": 16,
    "weight_norm": true
  },
  "encoder": {
    "kind": "standard_transformer",
    "norm_order": "pre",
    "num_layers": 60,
    "model_dim": 2048,
    "num_heads": 32,
    "ffn_dim": 8192
  },
  "ctc_head": {
    "vocab_size": 10288
  },
  "tokenizer": {
    "kind": "sentencepiece",
    "file": "tokenizer.model",
    "bos_idx": 0,
    "pad_idx": 1,
    "eos_idx": 2,
    "unk_idx": 3
  },
  "quantization": {
    "bits": 4,
    "group_size": 64,
    "method": "minmax_per_group",
    "targets": [
      "self_attn.q_proj.weight",
      "self_attn.k_proj.weight",
      "self_attn.v_proj.weight",
      "self_attn.output_proj.weight",
      "ffn.inner_proj.weight",
      "ffn.output_proj.weight",
      "final_proj.weight"
    ]
  }
}