{
  "model_id": "nvidia/parakeet-tdt-0.6b-v3",
  "format": "coreml-fp16",
  "sample_rate": 16000,
  "max_audio_seconds": 30.0,
  "max_audio_samples": 480000,
  "max_symbol_steps": 1,
  "vocab_size": 8192,
  "joint_extra_outputs": 5,
  "checkpoint": {
    "type": "pretrained",
    "model_id": "nvidia/parakeet-tdt-0.6b-v3"
  },
  "coreml": {
    "compute_units": "CPU_AND_NEURAL_ENGINE",
    "compute_precision": "FLOAT16"
  },
  "components": {
    "mel_encoder": {
      "description": "Fused preprocessor+encoder: raw audio to encoder frames in one model (ANE-accelerated).",
      "inputs": {
        "audio_signal": [1, 480000],
        "audio_length": [1]
      },
      "outputs": {
        "encoder": [1, 1024, 375],
        "encoder_length": [1]
      },
      "path": "parakeet_mel_encoder_30s.mlpackage"
    },
    "decoder": {
      "description": "LSTM prediction network (CPU).",
      "inputs": {
        "targets": [1, 1],
        "target_length": [1],
        "h_in": [2, 1, 640],
        "c_in": [2, 1, 640]
      },
      "outputs": {
        "decoder": [1, 640, 1],
        "h_out": [2, 1, 640],
        "c_out": [2, 1, 640]
      },
      "path": "parakeet_decoder.mlpackage"
    },
    "joint_logits_single_step": {
      "description": "Joint network exposing full-vocab token logits and duration logits (CPU). Enables host-side medical term boosting.",
      "inputs": {
        "encoder_step": [1, 1024, 1],
        "decoder_step": [1, 640, 1]
      },
      "outputs": {
        "token_logits": [1, 1, 1, 8193],
        "duration_logits": [1, 1, 1, 5]
      },
      "path": "parakeet_joint_logits_single_step.mlpackage"
    }
  }
}