{
  "model_id": "nvidia/parakeet-tdt-0.6b-v3",
  "format": "coreml-fp16",
  "sample_rate": 16000,
  "max_audio_seconds": 30.0,
  "max_audio_samples": 480000,
  "max_symbol_steps": 1,
  "vocab_size": 8192,
  "joint_extra_outputs": 5,
  "checkpoint": {
    "type": "pretrained",
    "model_id": "nvidia/parakeet-tdt-0.6b-v3"
  },
  "coreml": {
    "compute_units": "CPU_AND_NEURAL_ENGINE",
    "compute_precision": "FLOAT16"
  },
  "components": {
    "mel_encoder": {
      "description": "Fused preprocessor+encoder: raw audio to encoder frames in one model (ANE-accelerated).",
      "inputs": {
        "audio_signal": [1, 480000],
        "audio_length": [1]
      },
      "outputs": {
        "encoder": [1, 1024, 375],
        "encoder_length": [1]
      },
      "path": "parakeet_mel_encoder_30s.mlpackage"
    },
    "decoder": {
      "description": "LSTM prediction network (CPU).",
      "inputs": {
        "targets": [1, 1],
        "target_length": [1],
        "h_in": [2, 1, 640],
        "c_in": [2, 1, 640]
      },
      "outputs": {
        "decoder": [1, 640, 1],
        "h_out": [2, 1, 640],
        "c_out": [2, 1, 640]
      },
      "path": "parakeet_decoder.mlpackage"
    },
    "joint_logits_single_step": {
      "description": "Joint network exposing full-vocab token logits and duration logits (CPU). Enables host-side medical term boosting.",
      "inputs": {
        "encoder_step": [1, 1024, 1],
        "decoder_step": [1, 640, 1]
      },
      "outputs": {
        "token_logits": [1, 1, 1, 8193],
        "duration_logits": [1, 1, 1, 5]
      },
      "path": "parakeet_joint_logits_single_step.mlpackage"
    }
  }
}
|