Instructions to use yapwithai/kyutai-tts-1.6b-en_fr with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Moshi
How to use yapwithai/kyutai-tts-1.6b-en_fr with Moshi:
```bash
# pip install moshi

# Run the interactive web server
python -m moshi.server --hf-repo "yapwithai/kyutai-tts-1.6b-en_fr"
# Then open https://localhost:8998 in your browser
```
```python
# pip install moshi
import torch
from moshi.models import loaders

# Load checkpoint info from HuggingFace
checkpoint = loaders.CheckpointInfo.from_hf_repo("yapwithai/kyutai-tts-1.6b-en_fr")

# Load the Mimi audio codec
mimi = checkpoint.get_mimi(device="cuda")
mimi.set_num_codebooks(8)

# Encode audio (24kHz, mono)
wav = torch.randn(1, 1, 24000 * 10)  # [batch, channels, samples]
with torch.no_grad():
    codes = mimi.encode(wav.cuda())
    decoded = mimi.decode(codes)
```
- Notebooks
- Google Colab
- Kaggle
```json
{
  "card": 2048,
  "n_q": 32,
  "dep_q": 32,
  "delays": [
    0,
    0,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "dim": 2048,
  "text_card": 8000,
  "existing_text_padding_id": 3,
  "num_heads": 16,
  "num_layers": 16,
  "hidden_scale": 4.125,
  "causal": true,
  "layer_scale": null,
  "context": 500,
  "max_period": 10000,
  "gating": "silu",
  "norm": "rms_norm_f32",
  "positional_embedding": "rope",
  "depformer_dim": 1024,
  "depformer_num_heads": 16,
  "depformer_num_layers": 4,
  "depformer_dim_feedforward": 3072,
  "depformer_multi_linear": true,
  "depformer_pos_emb": "none",
  "depformer_weights_per_step": true,
  "depformer_low_rank_embeddings": 128,
  "demux_second_stream": true,
  "text_card_out": null,
  "conditioners": {
    "speaker_wavs": {
      "type": "tensor",
      "tensor": {
        "dim": 512
      }
    },
    "cfg": {
      "type": "lut",
      "lut": {
        "n_bins": 7,
        "dim": 16,
        "tokenizer": "noop",
        "possible_values": [
          "1.0",
          "1.5",
          "2.0",
          "2.5",
          "3.0",
          "3.5",
          "4.0"
        ]
      }
    },
    "control": {
      "type": "lut",
      "lut": {
        "dim": 2048,
        "n_bins": 1,
        "tokenizer": "noop",
        "possible_values": [
          "ok"
        ]
      }
    }
  },
  "fuser": {
    "cross_attention_pos_emb": true,
    "cross_attention_pos_emb_scale": 1,
    "sum": [
      "control",
      "cfg"
    ],
    "prepend": [],
    "cross": [
      "speaker_wavs"
    ]
  },
  "cross_attention": true,
  "tts_config": {
    "audio_delay": 1.28,
    "second_stream_ahead": 2
  },
  "model_id": {
    "sig": "1e68beda",
    "epoch": 240
  },
  "depformer_weights_per_step_schedule": [
    0,
    1,
    2,
    3,
    4,
    5,
    6,
    7,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    8,
    9,
    9,
    9,
    9,
    9,
    9,
    9,
    9,
    10,
    10,
    10,
    10,
    10,
    10,
    10,
    10
  ],
  "model_type": "tts",
  "lm_gen_config": {
    "temp": 0.6,
    "text_temp": 0.6
  },
  "tokenizer_name": "tokenizer_spm_8k_en_fr_audio.model",
  "mimi_name": "tokenizer-e351c8d8-checkpoint125.safetensors",
  "moshi_name": "dsm_tts_1e68beda@240.safetensors"
}
```