Rcarvalo committed on
Commit
39f8fcd
·
verified ·
1 Parent(s): 7f7c362

Upload VibeVoice French fine-tuned model (SIWIS, 10 epochs, full FT)

Browse files
Files changed (5) hide show
  1. README.md +74 -0
  2. config.json +86 -0
  3. model.safetensors +3 -0
  4. preprocessor_config.json +13 -0
  5. train_config.json +20 -0
README.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ base_model: microsoft/VibeVoice-Realtime-0.5B
4
+ tags:
5
+ - tts
6
+ - text-to-speech
7
+ - french
8
+ - vibevoice
9
+ - finetuned
10
+ language:
11
+ - fr
12
+ datasets:
13
+ - custom
14
+ pipeline_tag: text-to-speech
15
+ ---
16
+
17
+ # VibeVoice-Realtime-0.5B Fine-tuned (French SIWIS)
18
+
19
+ Fine-tuned version of [microsoft/VibeVoice-Realtime-0.5B](https://huggingface.co/microsoft/VibeVoice-Realtime-0.5B) on the French SIWIS dataset for improved French TTS.
20
+
21
+ ## Training Details
22
+
23
+ - **Base model**: microsoft/VibeVoice-Realtime-0.5B
24
+ - **Training data**: SIWIS French Speech Synthesis Database (~9,200 samples, 500 benchmark phrases excluded)
25
+ - **Training type**: Full fine-tuning of TTS language model (434M params)
26
+ - **Frozen components**: Acoustic tokenizer (VAE), prediction head (diffusion), language encoder (4 Qwen2.5 layers)
27
+
28
+ ### Hyperparameters
29
+
30
+ | Parameter | Value |
31
+ |-----------|-------|
32
+ | Epochs | 10 |
33
+ | Batch size | 4 |
34
+ | Gradient accumulation | 4 |
35
+ | Effective batch size | 16 |
36
+ | Learning rate | 5e-5 |
37
+ | Weight decay | 0.01 |
38
+ | Warmup steps | 500 |
39
+ | Precision | bf16 |
40
+
41
+ ### Hardware
42
+
43
+ - GPU: NVIDIA RTX 6000 Ada (48 GB)
44
+
45
+ ## Benchmark Results (500 SIWIS French phrases)
46
+
47
+ | Metric | Value |
48
+ |--------|-------|
49
+ | WER (mean) | 35.0% |
50
+ | WER (median) | 22.9% |
51
+ | RTF (mean) | 0.416 |
52
+
53
+ ## Usage
54
+
55
+ ```python
56
+ import torch
57
+ import soundfile as sf
58
+ from vibevoice.modular.modeling_vibevoice_streaming_inference import (
59
+ VibeVoiceStreamingForConditionalGenerationInference,
60
+ )
61
+
62
+ model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
63
+ "Rcarvalo/vibevoice",
64
+ torch_dtype=torch.bfloat16,
65
+ ).to("cuda")
66
+
67
+ # Generate French speech
68
+ audio = model.generate(text="Bonjour, comment allez-vous aujourd'hui?")
69
+ sf.write("output.wav", audio.cpu().numpy(), 24000)
70
+ ```
71
+
72
+ ## License
73
+
74
+ MIT (same as base model)
config.json ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "acoustic_vae_dim": 64,
3
+ "acoustic_tokenizer_config": {
4
+ "causal": true,
5
+ "channels": 1,
6
+ "conv_bias": true,
7
+ "conv_norm": "none",
8
+ "corpus_normalize": 0.0,
9
+ "decoder_depths": null,
10
+ "decoder_n_filters": 32,
11
+ "decoder_ratios": [
12
+ 8,
13
+ 5,
14
+ 5,
15
+ 4,
16
+ 2,
17
+ 2
18
+ ],
19
+ "disable_last_norm": true,
20
+ "encoder_depths": "3-3-3-3-3-3-8",
21
+ "encoder_n_filters": 32,
22
+ "encoder_ratios": [
23
+ 8,
24
+ 5,
25
+ 5,
26
+ 4,
27
+ 2,
28
+ 2
29
+ ],
30
+ "fix_std": 0.5,
31
+ "layer_scale_init_value": 1e-06,
32
+ "layernorm": "RMSNorm",
33
+ "layernorm_elementwise_affine": true,
34
+ "layernorm_eps": 1e-05,
35
+ "mixer_layer": "depthwise_conv",
36
+ "model_type": "vibevoice_acoustic_tokenizer",
37
+ "pad_mode": "constant",
38
+ "std_dist_type": "gaussian",
39
+ "vae_dim": 64,
40
+ "weight_init_value": 0.01
41
+ },
42
+ "architectures": [
43
+ "VibeVoiceStreamingForConditionalGenerationInference"
44
+ ],
45
+ "decoder_config": {
46
+ "attention_dropout": 0.0,
47
+ "hidden_act": "silu",
48
+ "hidden_size": 896,
49
+ "initializer_range": 0.02,
50
+ "intermediate_size": 4864,
51
+ "max_position_embeddings": 8192,
52
+ "max_window_layers": 24,
53
+ "model_type": "qwen2",
54
+ "num_attention_heads": 14,
55
+ "num_hidden_layers": 24,
56
+ "num_key_value_heads": 2,
57
+ "rms_norm_eps": 1e-06,
58
+ "rope_scaling": null,
59
+ "rope_theta": 1000000.0,
60
+ "sliding_window": null,
61
+ "tie_word_embeddings": false,
62
+ "torch_dtype": "bfloat16",
63
+ "use_cache": true,
64
+ "use_sliding_window": false,
65
+ "vocab_size": 151936
66
+ },
67
+ "diffusion_head_config": {
68
+ "ddpm_batch_mul": 4,
69
+ "ddpm_beta_schedule": "cosine",
70
+ "ddpm_num_inference_steps": 20,
71
+ "ddpm_num_steps": 1000,
72
+ "diffusion_type": "ddpm",
73
+ "head_ffn_ratio": 3.0,
74
+ "head_layers": 4,
75
+ "hidden_size": 896,
76
+ "latent_size": 64,
77
+ "model_type": "vibevoice_diffusion_head",
78
+ "prediction_type": "v_prediction",
79
+ "rms_norm_eps": 1e-05,
80
+ "speech_vae_dim": 64
81
+ },
82
+ "model_type": "vibevoice_streaming",
83
+ "torch_dtype": "bfloat16",
84
+ "transformers_version": "4.51.3",
85
+ "tts_backbone_num_hidden_layers": 20
86
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dbfa06e998ae8f5130475768bd5dafe56b979e420f289ca80807df16158240d
3
+ size 1737558440
preprocessor_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "processor_class": "VibeVoiceStreamingProcessor",
3
+ "speech_tok_compress_ratio": 3200,
4
+ "db_normalize": true,
5
+ "audio_processor": {
6
+ "feature_extractor_type": "VibeVoiceTokenizerProcessor",
7
+ "sampling_rate": 24000,
8
+ "normalize_audio": true,
9
+ "target_dB_FS": -25,
10
+ "eps": 1e-06
11
+ },
12
+ "language_model_pretrained_name": "Qwen/Qwen2.5-0.5B"
13
+ }
train_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "data_dir": "data/tokenized_siwis,data/tokenized_siwis_full",
3
+ "val_dir": "data/tokenized_val_full",
4
+ "output_dir": "outputs/full_ft_vibevoice",
5
+ "epochs": 10,
6
+ "batch_size": 4,
7
+ "grad_accum": 4,
8
+ "lr": 5e-05,
9
+ "weight_decay": 0.01,
10
+ "warmup_steps": 500,
11
+ "full_finetune": true,
12
+ "lora_rank": 16,
13
+ "lora_alpha": 32.0,
14
+ "max_text_len": 256,
15
+ "max_speech_len": 120,
16
+ "device": "cuda",
17
+ "bf16": true,
18
+ "save_every": 500,
19
+ "eval_every": 500
20
+ }