--- license: apache-2.0 base_model: - k2-fsa/OmniVoice pipeline_tag: text-to-speech tags: - comfyui - omnivoice - transformers - bf16 language: - "aae" - "aal" - "aao" - "ab" - "abb" - "abn" - "abr" - "abs" - "abv" - "acm" - "acw" - "acx" - "adf" - "adx" - "ady" - "aeb" - "aec" - "af" - "afb" - "afo" - "ahl" - "ahs" - "ajg" - "aju" - "ala" - "aln" - "alo" - "am" - "amu" - "an" - "anc" - "ank" - "anp" - "anw" - "aom" - "apc" - "apd" - "arb" - "arq" - "ars" - "ary" - "arz" - "as" - "ast" - "avl" - "awo" - "ayl" - "ayp" - "az" - "ba" - "bag" - "bas" - "bax" - "bba" - "bbj" - "bbl" - "bbu" - "bce" - "bci" - "bcs" - "bcy" - "bda" - "bde" - "bdm" - "be" - "beb" - "bew" - "bfd" - "bft" - "bg" - "bgp" - "bhb" - "bhh" - "bho" - "bhp" - "bhr" - "bjj" - "bjk" - "bjn" - "bjt" - "bkh" - "bkm" - "bky" - "bmm" - "bmq" - "bn" - "bnm" - "bnn" - "bns" - "bo" - "bou" - "bqg" - "br" - "bra" - "brh" - "bri" - "brx" - "bs" - "bsh" - "bsj" - "bsk" - "btm" - "btv" - "bug" - "bum" - "buo" - "bux" - "bwr" - "bxf" - "byc" - "bys" - "byv" - "byx" - "bzc" - "bzw" - "ca" - "ccg" - "ceb" - "cen" - "cfa" - "cgg" - "chq" - "cjk" - "ckb" - "ckl" - "ckr" - "cky" - "cnh" - "cpy" - "cs" - "cte" - "ctl" - "cut" - "cux" - "cv" - "cy" - "da" - "dag" - "dar" - "dav" - "dbd" - "dcc" - "de" - "deg" - "dgh" - "dgo" - "dje" - "dmk" - "dml" - "dru" - "dty" - "dua" - "dv" - "dyu" - "dzg" - "ebr" - "ebu" - "ego" - "eiv" - "eko" - "ekr" - "el" - "elm" - "en" - "eo" - "es" - "esu" - "et" - "eto" - "ets" - "etu" - "eu" - "ewo" - "ext" - "eyo" - "fa" - "fan" - "fat" - "ff" - "ffm" - "fi" - "fia" - "fil" - "fip" - "fkk" - "fmp" - "fr" - "fub" - "fuc" - "fue" - "fuf" - "fuh" - "fui" - "fuq" - "fuv" - "fy" - "ga" - "gbm" - "gbr" - "gby" - "gcc" - "gdf" - "gej" - "ges" - "ggg" - "gid" - "gig" - "giz" - "gjk" - "gju" - "gl" - "glw" - "gn" - "gol" - "gom" - "gsl" - "gu" - "gui" - "gur" - "guz" - "gv" - "gwc" - "gwe" - "gwt" - "gya" - "gyz" - "ha" - "hah" - "hao" - "haw" - "haz" - "hbb" - "he" - "hem" - "hi" - "hia" - 
"hkk" - "hla" - "hno" - "hoj" - "hr" - "hsb" - "ht" - "hu" - "hue" - "hul" - "hux" - "hwo" - "hy" - "hz" - "ia" - "ibb" - "id" - "ida" - "idu" - "ig" - "ijc" - "ijn" - "ik" - "ikw" - "is" - "ish" - "iso" - "it" - "its" - "itw" - "itz" - "ja" - "jal" - "jax" - "jgo" - "jmx" - "jns" - "jqr" - "juk" - "juo" - "jv" - "ka" - "kab" - "kai" - "kaj" - "kam" - "kbd" - "kbl" - "kbt" - "kcq" - "kdh" - "kea" - "keu" - "kfe" - "kfk" - "kfp" - "khg" - "khw" - "kj" - "kjc" - "kjk" - "kk" - "kln" - "kls" - "km" - "kmr" - "kmy" - "kn" - "kna" - "knn" - "ko" - "kol" - "koo" - "kpo" - "kqo" - "ks" - "ksd" - "ksf" - "kto" - "kuh" - "kvx" - "kw" - "kwm" - "kxp" - "ky" - "kyx" - "lag" - "lb" - "lcm" - "ldb" - "lg" - "lij" - "lir" - "lkb" - "lla" - "ln" - "lnu" - "lo" - "loa" - "lrk" - "lss" - "lt" - "ltg" - "lto" - "lua" - "luo" - "lus" - "lv" - "lwg" - "mab" - "maf" - "mai" - "mau" - "max" - "mbo" - "mcf" - "mcn" - "mcx" - "mdd" - "mde" - "mdf" - "mek" - "mer" - "meu" - "mfm" - "mfn" - "mfo" - "mfv" - "mgg" - "mgi" - "mhk" - "mhr" - "mi" - "mig" - "miu" - "mk" - "mkf" - "mki" - "ml" - "mlq" - "mn" - "mne" - "mni" - "mqy" - "mr" - "mrj" - "mrr" - "mrt" - "ms" - "mse" - "msh" - "msw" - "mt" - "mtr" - "mtu" - "mtx" - "mua" - "mug" - "mui" - "mve" - "mvy" - "mxs" - "mxu" - "mxy" - "my" - "myv" - "mzl" - "nal" - "nan" - "nap" - "nb" - "nbh" - "ncf" - "nco" - "ncx" - "ndi" - "ng" - "ngi" - "nhg" - "nhi" - "nhn" - "nhq" - "nja" - "nl" - "nla" - "nlv" - "nmg" - "nmz" - "nn" - "nnh" - "no" - "noe" - "npi" - "nso" - "ny" - "nyu" - "oc" - "odk" - "odu" - "ogo" - "om" - "orc" - "oru" - "ory" - "os" - "pa" - "pbs" - "pbt" - "pbu" - "pcm" - "pex" - "phl" - "phr" - "pip" - "piy" - "pko" - "pl" - "plk" - "plt" - "pmq" - "pms" - "pmy" - "pnb" - "poc" - "poe" - "pow" - "prq" - "ps" - "pst" - "pt" - "pua" - "pwn" - "qug" - "qum" - "qup" - "qur" - "qus" - "quv" - "qux" - "quy" - "qva" - "qvi" - "qvj" - "qvl" - "qwa" - "qws" - "qxa" - "qxp" - "qxt" - "qxu" - "qxw" - "rag" - "rm" - "ro" - "rob" - "rof" - 
"roo" - "rth" - "ru" - "rup" - "rw" - "sa" - "sah" - "sat" - "sau" - "say" - "sbn" - "sc" - "scl" - "scn" - "sd" - "sei" - "shu" - "si" - "sip" - "siw" - "sjr" - "sk" - "skg" - "skr" - "sl" - "sn" - "snc" - "snk" - "so" - "sol" - "sps" - "sq" - "sr" - "src" - "sro" - "ssi" - "ste" - "sua" - "sv" - "sva" - "sw" - "szy" - "ta" - "tan" - "tar" - "tay" - "tbf" - "tcf" - "tcy" - "tdn" - "tdx" - "te" - "tg" - "tgc" - "th" - "the" - "thq" - "thr" - "thv" - "ti" - "tig" - "tio" - "tk" - "tkg" - "tkt" - "tli" - "tlp" - "tn" - "tok" - "tpl" - "tpz" - "tqp" - "tr" - "trp" - "trq" - "trv" - "trw" - "tt" - "ttj" - "ttr" - "ttu" - "tui" - "tul" - "tuq" - "tuv" - "tuy" - "tvo" - "tvu" - "tw" - "twu" - "txs" - "txy" - "udl" - "ug" - "uk" - "uki" - "umb" - "ur" - "ush" - "uz" - "uzn" - "vai" - "var" - "ver" - "vi" - "vmc" - "vmj" - "vmm" - "vmp" - "vmz" - "vot" - "vro" - "wbl" - "wci" - "weo" - "wes" - "wja" - "wji" - "wo" - "wof" - "xh" - "xhe" - "xka" - "xmf" - "xmv" - "xmw" - "xpe" - "xti" - "xtu" - "yaq" - "yav" - "yay" - "ydd" - "ydg" - "yer" - "yes" - "yi" - "yo" - "yue" - "zga" - "zgh" - "zh" - "zoc" - "zoh" - "zor" - "zpv" - "zpy" - "ztg" - "ztn" - "ztp" - "zts" - "ztu" - "zu" - "zza" --- # OmniVoice-bf16 🌍 **BF16 quantized version of [k2-fsa/OmniVoice](https://huggingface.co/k2-fsa/OmniVoice).** ![Screenshot 2026-04-02 203949](https://cdn-uploads.huggingface.co/production/uploads/63473b59e5c0717e6737b872/iCeCjyFAAl7qFPRpvCiTb.png) [**Original Model**](https://huggingface.co/k2-fsa/OmniVoice) | [**Paper**](https://arxiv.org/abs/2604.00688) | [**GitHub (Original)**](https://github.com/k2-fsa/OmniVoice) | [**HuggingFace Space**](https://huggingface.co/spaces/k2-fsa/OmniVoice) | [**Demo Page**](https://zhu-han.github.io/omnivoice) | [**ComfyUI Node**](https://github.com/Saganaki22/ComfyUI-OmniVoice-TTS) --- ## What is this? 
This is a BF16 conversion of OmniVoice — a state-of-the-art zero-shot multilingual TTS model supporting 600+ languages, built on a diffusion language model architecture. Converting from FP32 to BF16 halves the on-disk size and VRAM usage with negligible quality loss, making it the recommended variant for most users. | | Original (FP32) | This (BF16) | |---|---|---| | **Weight dtype** | float32 | bfloat16 | | **Activation dtype** | float32 | bfloat16 | | **File size** | ~Full size | ~Half size | | **VRAM (inference)** | Higher | ~Halved | | **Quality** | Reference | Virtually identical | | **Extra dependencies** | none | none | --- ## Conversion Details All model weights are converted from float32 to bfloat16. BF16 preserves the same dynamic range as FP32 (8 exponent bits) while halving memory usage, at the cost of reduced mantissa precision (7 bits instead of 23), making it a practical, near-lossless choice for inference on modern GPUs. No post-training quantization, calibration data, or scale factors are required. The checkpoint is a direct dtype cast of the original weights and is fully compatible with the original `omnivoice` inference code. --- ## Hardware Requirements - **GPU:** NVIDIA GPU with CUDA support (BF16 natively supported on Ampere and newer; falls back gracefully on older hardware) - **CPU:** Supported but slow --- ## Usage This model is a drop-in replacement for `k2-fsa/OmniVoice`. Simply swap the model ID for `drbaph/OmniVoice-bf16` in any existing OmniVoice workflow. --- ## Usage — ComfyUI (Recommended) The easiest way to use this model is with **[ComfyUI-OmniVoice-TTS](https://github.com/Saganaki22/ComfyUI-OmniVoice-TTS)**, which has native support for this BF16 model with zero extra setup. ### Installation 1. Install the ComfyUI node via **ComfyUI Manager** (search `OmniVoice`) or manually: ```bash cd ComfyUI/custom_nodes git clone https://github.com/Saganaki22/ComfyUI-OmniVoice-TTS.git ``` 2. The model **auto-downloads on first use** — select `OmniVoice-bf16` from the model dropdown in any OmniVoice node. 3. 
Or download manually: ```bash huggingface-cli download drbaph/OmniVoice-bf16 --local-dir ComfyUI/models/omnivoice/OmniVoice-bf16 ``` ### Recommended Settings (ComfyUI) - `dtype`: `auto` or `bf16` — matches this model's native dtype - `num_step`: `16` (balanced), `32` (higher quality) - `keep_model_loaded`: `True` for repeated use > **This is the recommended variant** for most users — best balance of quality, VRAM usage, and compatibility. --- ## Usage — Python API ```python from omnivoice import OmniVoice import torch import torchaudio model = OmniVoice.from_pretrained( "drbaph/OmniVoice-bf16", device_map="cuda:0", dtype=torch.bfloat16 # matches native dtype of this model ) # Voice Cloning audio = model.generate( text="Hello, this is a test of zero-shot voice cloning.", ref_audio="ref.wav", ref_text="Transcription of the reference audio.", ) torchaudio.save("out.wav", audio[0], 24000) ``` > Set `dtype=torch.bfloat16` or `dtype="auto"` to match this model's native dtype and avoid any unnecessary casting overhead. ### Voice Design ```python audio = model.generate( text="Hello, this is a test of zero-shot voice design.", instruct="female, low pitch, british accent", ) ``` ### Auto Voice ```python audio = model.generate(text="This is a sentence without any voice prompt.") ``` ### Recommended Settings (Python API) - `dtype`: `"auto"` or `torch.bfloat16` — matches this model's native dtype - `num_step`: `16` (balanced), `32` (higher quality) - `speed`: `1.0` (default) For the full Python API reference, generation parameters, non-verbal symbols, pronunciation control, and batch inference, see the [original model card](https://huggingface.co/k2-fsa/OmniVoice). --- ## About OmniVoice OmniVoice is a state-of-the-art zero-shot multilingual TTS model from [k2-fsa](https://huggingface.co/k2-fsa) supporting 600+ languages. 
Built on a novel diffusion language model architecture, it generates high-quality speech with fast inference (real-time factor, RTF, as low as 0.025 — 40× faster than real-time) and supports both voice cloning and voice design. Key features: **600+ languages**, **zero-shot voice cloning**, **voice design** (gender, age, pitch, accent, dialect, etc.), and **fast diffusion-based inference**. --- ## License This model inherits the [Apache 2.0 License](https://www.apache.org/licenses/LICENSE-2.0) from [k2-fsa/OmniVoice](https://huggingface.co/k2-fsa/OmniVoice). The BF16 conversion was produced by [drbaph](https://huggingface.co/drbaph) and is released under the same license. --- ## Citation ```bibtex @article{zhu2026omnivoice, title={OmniVoice: Towards Omnilingual Zero-Shot Text-to-Speech with Diffusion Language Models}, author={Zhu, Han and Ye, Lingxuan and Kang, Wei and Yao, Zengwei and Guo, Liyong and Kuang, Fangjun and Han, Zhifeng and Zhuang, Weiji and Lin, Long and Povey, Daniel}, journal={arXiv preprint arXiv:2604.00688}, year={2026} } ```