{ "model_id": "nvidia/parakeet-tdt-0.6b-v3", "format": "coreml-fp16", "sample_rate": 16000, "max_audio_seconds": 30.0, "max_audio_samples": 480000, "max_symbol_steps": 1, "vocab_size": 8192, "joint_extra_outputs": 5, "checkpoint": { "type": "pretrained", "model_id": "nvidia/parakeet-tdt-0.6b-v3" }, "coreml": { "compute_units": "CPU_AND_NEURAL_ENGINE", "compute_precision": "FLOAT16" }, "components": { "mel_encoder": { "description": "Fused preprocessor+encoder: raw audio to encoder frames in one model (ANE-accelerated).", "inputs": { "audio_signal": [1, 480000], "audio_length": [1] }, "outputs": { "encoder": [1, 1024, 375], "encoder_length": [1] }, "path": "parakeet_mel_encoder_30s.mlpackage" }, "decoder": { "description": "LSTM prediction network (CPU).", "inputs": { "targets": [1, 1], "target_length": [1], "h_in": [2, 1, 640], "c_in": [2, 1, 640] }, "outputs": { "decoder": [1, 640, 1], "h_out": [2, 1, 640], "c_out": [2, 1, 640] }, "path": "parakeet_decoder.mlpackage" }, "joint_logits_single_step": { "description": "Joint network exposing full-vocab token logits and duration logits (CPU). Enables host-side medical term boosting.", "inputs": { "encoder_step": [1, 1024, 1], "decoder_step": [1, 640, 1] }, "outputs": { "token_logits": [1, 1, 1, 8193], "duration_logits": [1, 1, 1, 5] }, "path": "parakeet_joint_logits_single_step.mlpackage" } } }