{
  "feature_extraction": {
    "sequence": [
      {
        "operation": {
          "name": "audio_decoder",
          "type": "AudioDecoderEx",
          "attrs": {
            "target_sample_rates": [
              8000,
              16000
            ]
          }
        }
      },
      {
        "operation": {
          "name": "phi_4_audio_embed",
          "type": "Phi4AudioEmbed",
          "attrs": {
            "audio_compression_rate": 8,
            "stft_normal/n_fft": 512,
            "stft_normal/frame_length": 400,
            "stft_normal/hop_length": 160,
            "stft_normal/win_fn": "hamming",
            "logmel/chunk_size": 30,
            "logmel/hop_length": 160,
            "logmel/n_fft": 512,
            "logmel/n_mel": 80,
            "logmel/feature_first": 0,
            "logmel/no_padding": 1,
            "stft_normal_8k/n_fft": 256,
            "stft_normal_8k/frame_length": 200,
            "stft_normal_8k/hop_length": 80,
            "stft_normal_8k/win_fn": "hamming",
            "logmel_8k/chunk_size": 30,
            "logmel_8k/hop_length": 80,
            "logmel_8k/n_fft": 512,
            "logmel_8k/n_mel": 80,
            "logmel_8k/feature_first": 0,
            "logmel_8k/no_padding": 1
          }
        }
      }
    ],
    "output_aligner": "phi4-audio-aligner"
  }
}