beaupi
/

VibeVoice-ASR-oQ3.5-fp16

+---
+language:
+- en   # English
+- zh   # Chinese
+- es   # Spanish
+- pt   # Portuguese
+- de   # German
+- ja   # Japanese
+- ko   # Korean
+- fr   # French
+- ru   # Russian
+- id   # Indonesian
+- sv   # Swedish
+- it   # Italian
+- he   # Hebrew
+- nl   # Dutch
+- pl   # Polish
+- no   # Norwegian
+- tr   # Turkish
+- th   # Thai
+- ar   # Arabic
+- hu   # Hungarian
+- ca   # Catalan
+- cs   # Czech
+- da   # Danish
+- fa   # Persian
+- af   # Afrikaans
+- hi   # Hindi
+- fi   # Finnish
+- et   # Estonian
+- aa   # Afar
+- el   # Greek
+- ro   # Romanian
+- vi   # Vietnamese
+- bg   # Bulgarian
+- is   # Icelandic
+- sl   # Slovenian
+- sk   # Slovak
+- lt   # Lithuanian
+- sw   # Swahili
+- uk   # Ukrainian
+- kl   # Kalaallisut
+- lv   # Latvian
+- hr   # Croatian
+- ne   # Nepali
+- sr   # Serbian
+- tl   # Filipino (ISO 639-1 code for Tagalog; common engineering alias: fil)
+- yi   # Yiddish
+- ms   # Malay
+- ur   # Urdu
+- mn   # Mongolian
+- hy   # Armenian
+- jv   # Javanese
+license: mit
+pipeline_tag: automatic-speech-recognition
+tags:
+- ASR
+- Transcription
+- Diarization
+- Speech-to-Text
+library_name: transformers
+---
+## VibeVoice-ASR
+[![GitHub](https://img.shields.io/badge/GitHub-Repo-black?logo=github)](https://github.com/microsoft/VibeVoice)
+[![Live Playground](https://img.shields.io/badge/Live-Playground-green?logo=gradio)](https://aka.ms/vibevoice-asr)
+[![Technical Report](https://img.shields.io/badge/arXiv-2601.18184-b31b1b?logo=arxiv)](https://arxiv.org/pdf/2601.18184)
+**VibeVoice-ASR** is a unified speech-to-text model designed to handle **60-minute long-form audio** in a single pass, generating structured transcriptions containing **Who (Speaker), When (Timestamps), and What (Content)**, with support for **Customized Hotwords** and over **50 languages**.
+➡️ **Code:** [microsoft/VibeVoice](https://github.com/microsoft/VibeVoice)<br>
+➡️ **Demo:** [VibeVoice-ASR-Demo](https://aka.ms/vibevoice-asr)<br>
+➡️ **Report:** [VibeVoice-ASR Technical Report](https://arxiv.org/pdf/2601.18184)<br>
+➡️ **Finetuning:** [Finetuning](https://github.com/microsoft/VibeVoice/blob/main/finetuning-asr/README.md)<br>
+➡️ **vLLM:** [vLLM-VibeVoice-ASR](https://github.com/microsoft/VibeVoice/blob/main/docs/vibevoice-vllm-asr.md)<br>
+<p align="left">
+  <img src="figures/VibeVoice_ASR_archi.png" alt="VibeVoice-ASR Architecture" height="250px">
+</p>
+## 🔥 Key Features
+- **🕒 60-minute Single-Pass Processing**:
+  Unlike conventional ASR models that slice audio into short chunks (often losing global context), VibeVoice-ASR accepts up to **60 minutes** of continuous audio input within a 64K-token context length. This ensures consistent speaker tracking and semantic coherence across the entire hour.
+- **👤 Customized Hotwords**:
+  Users can provide customized hotwords (e.g., specific names, technical terms, or background info) to guide the recognition process, significantly improving accuracy on domain-specific content.
+- **📝 Rich Transcription (Who, When, What)**:
+  The model jointly performs ASR, diarization, and timestamping, producing a structured output that indicates *who* said *what* and *when*.
+- **🌍 Multilingual & Code-Switching Support**:
+  It supports over 50 languages, requires no explicit language setting, and natively handles code-switching within and across utterances. Language distribution can be found [here](#language-distribution).
+## Evaluation
+<p align="center">
+  <img src="figures/DER.jpg" alt="DER" width="70%">
+  <img src="figures/cpWER.jpg" alt="cpWER" width="70%">
+  <img src="figures/tcpWER.jpg" alt="tcpWER" width="70%">
+</p>
+## Installation and Usage
+Please refer to the [GitHub README](https://github.com/microsoft/VibeVoice/blob/main/docs/vibevoice-asr.md#installation).
+## Language Distribution
+<p align="center">
+  <img src="figures/language_distribution_horizontal.png" alt="Language Distribution" width="80%">
+</p>
+## License
+This project is licensed under the MIT License.
+## Contact
+This project was conducted by members of Microsoft Research. We welcome feedback and collaboration from our audience. If you have suggestions, questions, or observe unexpected/offensive behavior in our technology, please contact us at VibeVoice@microsoft.com.
+If the team receives reports of undesired behavior or identifies issues independently, we will update this repository with appropriate mitigations.

config.json ADDED Viewed

	@@ -0,0 +1,218 @@

+{
+  "_attn_implementation_autoset": false,
+  "acoustic_tokenizer_config": {
+    "causal": true,
+    "channels": 1,
+    "conv_bias": true,
+    "conv_norm": "none",
+    "corpus_normalize": 0.0,
+    "decoder_depths": null,
+    "decoder_n_filters": 32,
+    "decoder_ratios": [
+      8,
+      5,
+      5,
+      4,
+      2,
+      2
+    ],
+    "disable_last_norm": true,
+    "dtype": "bfloat16",
+    "encoder_depths": "3-3-3-3-3-3-8",
+    "encoder_n_filters": 32,
+    "encoder_ratios": [
+      8,
+      5,
+      5,
+      4,
+      2,
+      2
+    ],
+    "fix_std": 0.5,
+    "layer_scale_init_value": 1e-06,
+    "layernorm": "RMSNorm",
+    "layernorm_elementwise_affine": true,
+    "layernorm_eps": 1e-05,
+    "mixer_layer": "depthwise_conv",
+    "model_type": "vibevoice_acoustic_tokenizer",
+    "pad_mode": "constant",
+    "std_dist_type": "gaussian",
+    "vae_dim": 64,
+    "weight_init_value": 0.01
+  },
+  "acoustic_vae_dim": 64,
+  "architectures": [
+    "VibeVoiceForASRTraining"
+  ],
+  "decoder_config": {
+    "attention_dropout": 0.0,
+    "dtype": "bfloat16",
+    "hidden_act": "silu",
+    "hidden_size": 3584,
+    "initializer_range": 0.02,
+    "intermediate_size": 18944,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 131072,
+    "max_window_layers": 28,
+    "model_type": "qwen2",
+    "num_attention_heads": 28,
+    "num_hidden_layers": 28,
+    "num_key_value_heads": 4,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 1000000.0,
+    "sliding_window": null,
+    "use_cache": true,
+    "use_mrope": false,
+    "use_sliding_window": false,
+    "vocab_size": 152064
+  },
+  "diffusion_head_config": {
+    "ddpm_batch_mul": 4,
+    "ddpm_beta_schedule": "cosine",
+    "ddpm_num_inference_steps": 20,
+    "ddpm_num_steps": 1000,
+    "diffusion_type": "ddpm",
+    "head_ffn_ratio": 3.0,
+    "head_layers": 4,
+    "hidden_size": 3584,
+    "latent_size": 64,
+    "model_type": "vibepod_diffusion_head",
+    "prediction_type": "v_prediction",
+    "rms_norm_eps": 1e-05,
+    "speech_vae_dim": 64
+  },
+  "dtype": "float32",
+  "model_type": "vibevoice",
+  "semantic_tokenizer_config": {
+    "causal": true,
+    "channels": 1,
+    "conv_bias": true,
+    "conv_norm": "none",
+    "corpus_normalize": 0.0,
+    "disable_last_norm": true,
+    "dtype": "bfloat16",
+    "encoder_depths": "3-3-3-3-3-3-8",
+    "encoder_n_filters": 32,
+    "encoder_ratios": [
+      8,
+      5,
+      5,
+      4,
+      2,
+      2
+    ],
+    "fix_std": 0,
+    "layer_scale_init_value": 1e-06,
+    "layernorm": "RMSNorm",
+    "layernorm_elementwise_affine": true,
+    "layernorm_eps": 1e-05,
+    "mixer_layer": "depthwise_conv",
+    "model_type": "vibevoice_semantic_tokenizer",
+    "pad_mode": "constant",
+    "std_dist_type": "none",
+    "vae_dim": 128,
+    "weight_init_value": 0.01
+  },
+  "semantic_vae_dim": 128,
+  "transformers_version": "4.57.6",
+  "quantization": {
+    "group_size": 64,
+    "bits": 3,
+    "mode": "affine",
+    "model.language_model.embed_tokens": {
+      "bits": 8,
+      "group_size": 64,
+      "mode": "affine"
+    },
+    "model.language_model.layers.0.self_attn.k_proj": {
+      "bits": 5,
+      "group_size": 64,
+      "mode": "affine"
+    },
+    "model.language_model.layers.0.self_attn.o_proj": {
+      "bits": 5,
+      "group_size": 64,
+      "mode": "affine"
+    },
+    "model.language_model.layers.0.self_attn.v_proj": {
+      "bits": 6,
+      "group_size": 64,
+      "mode": "affine"
+    },
+    "model.language_model.layers.1.self_attn.k_proj": {
+      "bits": 5,
+      "group_size": 64,
+      "mode": "affine"
+    },
+    "model.language_model.layers.2.self_attn.k_proj": {
+      "bits": 5,
+      "group_size": 64,
+      "mode": "affine"
+    }
+  },
+  "quantization_config": {
+    "group_size": 64,
+    "bits": 3,
+    "mode": "affine",
+    "model.language_model.embed_tokens": {
+      "bits": 8,
+      "group_size": 64,
+      "mode": "affine"
+    },
+    "model.language_model.layers.0.self_attn.k_proj": {
+      "bits": 5,
+      "group_size": 64,
+      "mode": "affine"
+    },
+    "model.language_model.layers.0.self_attn.o_proj": {
+      "bits": 5,
+      "group_size": 64,
+      "mode": "affine"
+    },
+    "model.language_model.layers.0.self_attn.v_proj": {
+      "bits": 6,
+      "group_size": 64,
+      "mode": "affine"
+    },
+    "model.language_model.layers.1.self_attn.k_proj": {
+      "bits": 5,
+      "group_size": 64,
+      "mode": "affine"
+    },
+    "model.language_model.layers.2.self_attn.k_proj": {
+      "bits": 5,
+      "group_size": 64,
+      "mode": "affine"
+    }
+  }
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2a9aee305c7685742d900881fc4d85bf1c03aa33742cc48f7d2c9b9ca4217d8c
+size 4338610607