Automatic Speech Recognition
Transformers
Safetensors
Danish
cohere_asr
audio
speech-recognition
transcription
danish
hf-asr-leaderboard
custom_code
Instructions to use syvai/hviske-v5.1 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use syvai/hviske-v5.1 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="syvai/hviske-v5.1", trust_remote_code=True)
```

```python
# Load model directly
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

processor = AutoProcessor.from_pretrained("syvai/hviske-v5.1", trust_remote_code=True)
model = AutoModelForSpeechSeq2Seq.from_pretrained("syvai/hviske-v5.1", trust_remote_code=True)
```

- Notebooks
- Google Colab
- Kaggle
Upload fine-tuned Danish ASR (hviske v5.1) — coral_read_aloud WER 19.5%, coral_conversation WER 25.5%
d382915 verified
| { | |
| "architectures": [ | |
| "CohereAsrForConditionalGeneration" | |
| ], | |
| "auto_map": { | |
| "AutoConfig": "configuration_cohere_asr.CohereAsrConfig", | |
| "AutoFeatureExtractor": "processing_cohere_asr.CohereAsrFeatureExtractor", | |
| "AutoModel": "modeling_cohere_asr.CohereAsrModel", | |
| "AutoModelForSpeechSeq2Seq": "modeling_cohere_asr.CohereAsrForConditionalGeneration", | |
| "AutoProcessor": "processing_cohere_asr.CohereAsrProcessor", | |
| "AutoTokenizer": "tokenization_cohere_asr.CohereAsrTokenizer" | |
| }, | |
| "batch_size": 128, | |
| "decoding": { | |
| "beam": { | |
| "beam_size": 1, | |
| "len_pen": 0.0, | |
| "max_generation_delta": 50 | |
| }, | |
| "return_best_hypothesis": true, | |
| "strategy": "beam" | |
| }, | |
| "dtype": "bfloat16", | |
| "encoder": { | |
| "att_context_size": [ | |
| -1, | |
| -1 | |
| ], | |
| "causal_downsampling": false, | |
| "conv_context_size": null, | |
| "conv_kernel_size": 9, | |
| "conv_norm_type": "batch_norm", | |
| "d_model": 1280, | |
| "dropout": 0, | |
| "dropout_att": 0, | |
| "dropout_emb": 0, | |
| "dropout_pre_encoder": 0, | |
| "feat_in": 128, | |
| "feat_out": -1, | |
| "ff_expansion_factor": 4, | |
| "n_heads": 8, | |
| "n_layers": 48, | |
| "pos_emb_max_len": 5000, | |
| "reduction": null, | |
| "reduction_factor": 1, | |
| "reduction_position": null, | |
| "self_attention_model": "rel_pos", | |
| "subsampling": "dw_striding", | |
| "subsampling_conv_channels": 256, | |
| "subsampling_factor": 8, | |
| "untie_biases": true, | |
| "xscaling": false | |
| }, | |
| "head": { | |
| "activation": "relu", | |
| "dropout": 0, | |
| "hidden_size": 1024, | |
| "log_softmax": true, | |
| "num_classes": 16384, | |
| "num_layers": 1, | |
| "use_transformer_init": true | |
| }, | |
| "is_encoder_decoder": true, | |
| "log_batch_stats": false, | |
| "log_prediction": true, | |
| "max_audio_clip_s": 35, | |
| "max_seq_len": 1024, | |
| "min_energy_window_samples": 1600, | |
| "model_defaults": { | |
| "asr_enc_hidden": 1280, | |
| "lm_dec_hidden": 1024, | |
| "lm_enc_hidden": 1024 | |
| }, | |
| "model_type": "cohere_asr", | |
| "multitask_metrics_cfg": { | |
| "log_predictions": true, | |
| "metrics": { | |
| "wer": { | |
| "constraint": ".source_lang==.target_lang" | |
| } | |
| } | |
| }, | |
| "overlap_chunk_second": 5, | |
| "preprocessor": { | |
| "dither": 1e-05, | |
| "features": 128, | |
| "frame_splicing": 1, | |
| "log": true, | |
| "n_fft": 512, | |
| "normalize": "per_feature", | |
| "pad_to": 0, | |
| "pad_value": 0.0, | |
| "sample_rate": 16000, | |
| "window": "hann", | |
| "window_size": 0.025, | |
| "window_stride": 0.01 | |
| }, | |
| "prompt_defaults": [ | |
| { | |
| "role": "user", | |
| "slots": { | |
| "decodercontext": "", | |
| "diarize": "<|nodiarize|>", | |
| "emotion": "<|emo:undefined|>", | |
| "itn": "<|noitn|>", | |
| "pnc": "<|pnc|>", | |
| "source_lang": "<|en|>", | |
| "target_lang": "<|en|>", | |
| "timestamp": "<|notimestamp|>" | |
| } | |
| }, | |
| { | |
| "role": "user_partial", | |
| "slots": { | |
| "decodercontext": "" | |
| } | |
| } | |
| ], | |
| "prompt_format": "cohere_asr", | |
| "sample_rate": 16000, | |
| "supported_languages": [ | |
| "en", | |
| "fr", | |
| "de", | |
| "es", | |
| "it", | |
| "pt", | |
| "nl", | |
| "pl", | |
| "el", | |
| "ar", | |
| "ja", | |
| "zh", | |
| "vi", | |
| "ko", | |
| "da" | |
| ], | |
| "transf_decoder": { | |
| "config_dict": { | |
| "attn_layer_dropout": 0, | |
| "attn_score_dropout": 0, | |
| "embedding_dropout": 0, | |
| "ffn_dropout": 0, | |
| "hidden_act": "relu", | |
| "hidden_size": 1024, | |
| "inner_size": 4096, | |
| "learn_positional_encodings": false, | |
| "lm_dec_hidden": 1280, | |
| "max_sequence_length": 1024, | |
| "num_attention_heads": 8, | |
| "num_layers": 8, | |
| "num_token_types": 0, | |
| "pre_ln": true, | |
| "vocab_size": "None" | |
| }, | |
| "encoder": null, | |
| "model_name": null, | |
| "pre_ln_final_layer_norm": true, | |
| "pretrained": false | |
| }, | |
| "transf_encoder": { | |
| "attn_layer_dropout": 0, | |
| "attn_score_dropout": 0, | |
| "ffn_dropout": 0, | |
| "hidden_size": 1024, | |
| "inner_size": 4096, | |
| "mask_future": false, | |
| "num_attention_heads": 8, | |
| "num_layers": 0, | |
| "pre_ln": true, | |
| "pre_ln_final_layer_norm": true | |
| }, | |
| "transformers_version": "4.57.6", | |
| "use_loss_mask_for_prompt": false, | |
| "vocab_size": 16384 | |
| } | |