Foradc Claude Sonnet 4.6 committed on
Commit
737927a
·
1 Parent(s): f627e2c

feat: add Voxtral TTS engine + French prosody preprocessing

Browse files

- Add preprocess_french() with FRENCH_ABBREVS (arXiv:2508.17494) — expands
abbreviations and normalizes guillemet spacing, em-dash spacing, and ellipses
- Add FRENCH_NARRATOR_PROMPT optimized for audiobook narration
- Add /generate/voxtral endpoint (vLLM-Omni, Mistral Voxtral-4B-TTS-2603)
with auto-fallback to narrator_reference.wav for voice cloning
- Add VOXTRAL_URL env var (default: http://localhost:8000)
- Update /status to expose voxtral_url and voxtral_model
- Add Voxtral pill (FR★) in engine selector UI
- Add Voxtral panel with ref audio upload in innerPanelFR
- Add generateVoxtral() JS function + i18n labels (FR/EN)
- Wire into ENGINE_TO_ITAB, ENGINE_TO_ROW, generateCurrent() switch

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. index.html +61 -2
  2. server.py +120 -0
index.html CHANGED
@@ -1049,6 +1049,11 @@ select option, select optgroup {
1049
  Fish-Speech
1050
  <span class="ep-tag" style="background:#0ea5e9">NEW</span>
1051
  </button>
 
 
 
 
 
1052
  </div>
1053
  </div>
1054
  <div class="engine-divider"></div>
@@ -1241,6 +1246,20 @@ select option, select optgroup {
1241
  </button>
1242
  </div>
1243
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1244
  </div><!-- /#innerPanelFR -->
1245
 
1246
  <!-- ── Panel Qwen3-TTS ── -->
@@ -1902,6 +1921,7 @@ const I18N = {
1902
  engChatterbox: 'Chatterbox — clonage zero-shot (ResembleAI)',
1903
  engF5: 'F5-TTS FR — clonage vocal français',
1904
  engFish: 'Fish-Speech 1.5 — clonage vocal multilingue',
 
1905
  genBtn: 'Générer',
1906
  bookGenBtn: 'Générer la sélection',
1907
  bookPause: 'Pause',
@@ -1917,6 +1937,7 @@ const I18N = {
1917
  engChatterbox: 'Chatterbox — zero-shot cloning (ResembleAI)',
1918
  engF5: 'F5-TTS FR — French voice cloning',
1919
  engFish: 'Fish-Speech 1.5 — multilingual voice cloning',
 
1920
  genBtn: 'Generate',
1921
  bookGenBtn: 'Generate selection',
1922
  bookPause: 'Pause',
@@ -1943,6 +1964,7 @@ function setLang(lang) {
1943
  $('engLblChatterbox').textContent = t.engChatterbox;
1944
  $('engLblF5').textContent = t.engF5;
1945
  $('engLblFish').textContent = t.engFish;
 
1946
  // Buttons
1947
  $('genBtnLabel').textContent = t.genBtn;
1948
  $('bookGenBtnLabel').textContent = t.bookGenBtn;
@@ -2113,8 +2135,8 @@ function switchInnerTab(tab) {
2113
  // ── Engine pill selector ───────────────────────────────────────────────────────
2114
  let currentEngine = 'kokoro'; // selected studio engine
2115
 
2116
- const ENGINE_TO_ITAB = { kokoro:'fr', chatterbox:'fr', f5:'fr', fish:'fr', clone:'qwen', custom:'qwen', design:'qwen' };
2117
- const ENGINE_TO_ROW = { kokoro:'rowKokoro', chatterbox:'rowChatterbox', f5:'rowF5', fish:'rowFish' };
2118
  const ENGINE_TO_QWEN = { clone:'voice_clone', custom:'custom', design:'voice_design' };
2119
 
2120
  function selectStudioEngine(engine) {
@@ -2152,6 +2174,7 @@ async function generateCurrent() {
2152
  case 'chatterbox': await generateChatterbox(); break;
2153
  case 'f5': await generateF5(); break;
2154
  case 'fish': await generateFish(); break;
 
2155
  default: break;
2156
  }
2157
  }
@@ -2340,6 +2363,42 @@ async function generateFish() {
2340
  busy = false; setPlayBtns(false);
2341
  }
2342
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2343
  async function loadModel() {
2344
  const btn = $('loadBtn');
2345
  const btnVis = $('loadBtnVis');
 
1049
  Fish-Speech
1050
  <span class="ep-tag" style="background:#0ea5e9">NEW</span>
1051
  </button>
1052
+ <button class="epill" id="pill-voxtral" style="--ep-color:#f59e0b" onclick="selectStudioEngine('voxtral')">
1053
+ <span class="ep-dot" style="background:#f59e0b"></span>
1054
+ Voxtral
1055
+ <span class="ep-tag" style="background:#f59e0b;color:#000">FR★</span>
1056
+ </button>
1057
  </div>
1058
  </div>
1059
  <div class="engine-divider"></div>
 
1246
  </button>
1247
  </div>
1248
  </div>
1249
+ <!-- Voxtral TTS -->
1250
+ <div class="mode-row" id="rowVoxtral">
1251
+ <div class="label"><span id="engLblVoxtral">Voxtral 4B — TTS français Mistral AI (vLLM-Omni)</span><span class="switch-tag" style="background:#f59e0b;color:#000">FR★</span></div>
1252
+ <div class="row" style="flex-wrap:wrap;gap:6px">
1253
+ <label style="font-size:11px;color:var(--dim);flex-shrink:0">Réf audio :</label>
1254
+ <input type="file" id="voxtralRefWav" accept="audio/*" style="font-size:11px;color:var(--dim);flex:2;min-width:120px;max-width:180px">
1255
+ <label style="font-size:11px;color:var(--dim);flex-shrink:0">Texte réf :</label>
1256
+ <input type="text" id="voxtralRefText" placeholder="Transcription du clip..." style="font-size:11px;flex:3;min-width:140px;background:var(--surface2);border:1px solid var(--border);border-radius:var(--radius-sm);color:var(--text);padding:4px 6px">
1257
+ </div>
1258
+ <div style="font-size:11px;color:var(--dim);margin-top:4px">
1259
+ Nécessite un serveur vLLM-Omni local (<code style="font-size:10px;color:var(--accent)">python3 voxtral_server.py</code>).
1260
+ Si aucun clip de référence, utilise <code style="font-size:10px;color:var(--accent)">narrator_reference.wav</code>.
1261
+ </div>
1262
+ </div>
1263
  </div><!-- /#innerPanelFR -->
1264
 
1265
  <!-- ── Panel Qwen3-TTS ── -->
 
1921
  engChatterbox: 'Chatterbox — clonage zero-shot (ResembleAI)',
1922
  engF5: 'F5-TTS FR — clonage vocal français',
1923
  engFish: 'Fish-Speech 1.5 — clonage vocal multilingue',
1924
+ engVoxtral: 'Voxtral 4B — TTS français Mistral AI (vLLM-Omni)',
1925
  genBtn: 'Générer',
1926
  bookGenBtn: 'Générer la sélection',
1927
  bookPause: 'Pause',
 
1937
  engChatterbox: 'Chatterbox — zero-shot cloning (ResembleAI)',
1938
  engF5: 'F5-TTS FR — French voice cloning',
1939
  engFish: 'Fish-Speech 1.5 — multilingual voice cloning',
1940
+ engVoxtral: 'Voxtral 4B — Mistral AI French TTS (vLLM-Omni)',
1941
  genBtn: 'Generate',
1942
  bookGenBtn: 'Generate selection',
1943
  bookPause: 'Pause',
 
1964
  $('engLblChatterbox').textContent = t.engChatterbox;
1965
  $('engLblF5').textContent = t.engF5;
1966
  $('engLblFish').textContent = t.engFish;
1967
+ $('engLblVoxtral').textContent = t.engVoxtral;
1968
  // Buttons
1969
  $('genBtnLabel').textContent = t.genBtn;
1970
  $('bookGenBtnLabel').textContent = t.bookGenBtn;
 
2135
  // ── Engine pill selector ───────────────────────────────────────────────────────
2136
  let currentEngine = 'kokoro'; // selected studio engine
2137
 
2138
+ const ENGINE_TO_ITAB = { kokoro:'fr', chatterbox:'fr', f5:'fr', fish:'fr', voxtral:'fr', clone:'qwen', custom:'qwen', design:'qwen' };
2139
+ const ENGINE_TO_ROW = { kokoro:'rowKokoro', chatterbox:'rowChatterbox', f5:'rowF5', fish:'rowFish', voxtral:'rowVoxtral' };
2140
  const ENGINE_TO_QWEN = { clone:'voice_clone', custom:'custom', design:'voice_design' };
2141
 
2142
  function selectStudioEngine(engine) {
 
2174
  case 'chatterbox': await generateChatterbox(); break;
2175
  case 'f5': await generateF5(); break;
2176
  case 'fish': await generateFish(); break;
2177
+ case 'voxtral': await generateVoxtral(); break;
2178
  default: break;
2179
  }
2180
  }
 
2363
  busy = false; setPlayBtns(false);
2364
  }
2365
 
2366
/**
 * Generate speech with the Voxtral engine and load the result into the player.
 *
 * Reads the text from #textIn and, when provided, the reference clip
 * (#voxtralRefWav) and its transcription (#voxtralRefText), then POSTs them
 * as multipart form data to /generate/voxtral. On success the returned WAV
 * is handed to setPlayer() and playback is started; on failure the error
 * message is shown via showMsg(). No-op while another generation is running.
 */
async function generateVoxtral() {
  if (busy) return;
  const text = $('textIn').value.trim();
  if (!text) { showMsg('err', 'Entre du texte d\'abord.'); return; }
  hideMsg();
  busy = true; setPlayBtns(true);
  $('results').classList.add('show');
  $('player').parentElement.style.display = 'none';
  $('waveInd').classList.remove('off');
  $('pbarWrap').classList.add('show');
  const pb = $('pbar'); pb.style.width = ''; pb.classList.add('spin');
  resetMetrics();
  const t0 = performance.now();
  try {
    const fd = new FormData();
    fd.append('text', text);
    // Both reference fields are optional; only send what the user filled in.
    const refInput = $('voxtralRefWav');
    const refFile = refInput && refInput.files[0];
    if (refFile) fd.append('ref_wav', refFile);
    const refTextInput = $('voxtralRefText');
    const refText = refTextInput ? refTextInput.value.trim() : '';
    if (refText) fd.append('ref_text', refText);

    const res = await fetch('/generate/voxtral', { method: 'POST', body: fd });
    if (!res.ok) {
      // The error body may not be JSON (e.g. a proxy error page); do not let
      // a parse failure hide the fact that the request itself failed.
      const body = await res.json().catch(() => ({}));
      // `detail` is a string for HTTPException responses but a structured
      // value for request-validation errors; stringify the latter.
      let detail = '';
      if (typeof body.detail === 'string') detail = body.detail;
      else if (body.detail) detail = JSON.stringify(body.detail);
      throw new Error(detail || 'Erreur Voxtral');
    }
    const blob = new Blob([await res.arrayBuffer()], { type: 'audio/wav' });
    const genSecs = (performance.now() - t0) / 1000;
    setPlayer(blob);
    // Autoplay may be rejected by the browser; that is not an error here.
    $('player').play().catch(() => {});
    pb.classList.remove('spin'); pb.style.width = '100%';
    $('waveInd').classList.add('off');
    $('mClient').textContent = genSecs.toFixed(1) + 's';
  } catch (e) {
    showMsg('err', e.message);
    pb.classList.remove('spin'); pb.style.width = '0%';
    // Stop the activity indicator on failure too, mirroring the success path.
    $('waveInd').classList.add('off');
  } finally {
    // Always release the busy flag, even if the error handler itself throws.
    busy = false; setPlayBtns(false);
  }
}
2401
+
2402
  async function loadModel() {
2403
  const btn = $('loadBtn');
2404
  const btnVis = $('loadBtnVis');
server.py CHANGED
@@ -17,6 +17,7 @@ import hashlib
17
  import io
18
  import json
19
  import os
 
20
  import sys
21
  import tempfile
22
  import threading
@@ -32,6 +33,10 @@ from fastapi import FastAPI, File, Form, HTTPException, UploadFile
32
  from fastapi.middleware.cors import CORSMiddleware
33
  from fastapi.responses import FileResponse, JSONResponse, Response, StreamingResponse
34
 
 
 
 
 
35
  # ── Fish-Speech ───────────────────────────────────────────────────────────────
36
  FISH_SPEECH_REPO = Path("/tmp/fish-speech")
37
  FISH_SPEECH_MODEL = Path("/root/fish-speech-model")
@@ -294,6 +299,58 @@ _AUDIO_TOO_LARGE_MSG = (
294
  )
295
 
296
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
297
  # ─── Helpers ──────────────────────────────────────────────────────────────────
298
 
299
  def _to_wav_b64(audio: np.ndarray, sr: int) -> str:
@@ -379,6 +436,8 @@ async def get_status():
379
  "queue_depth": _generation_waiters,
380
  "cached_models": list(_model_cache.keys()),
381
  "kokoro_voices": KOKORO_VOICES_FR,
 
 
382
  }
383
 
384
 
@@ -971,6 +1030,67 @@ async def generate_fish(
971
  os.unlink(prev_path)
972
 
973
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
974
  # ─── Entry point ──────────────────────────────────────────────────────────────
975
 
976
  def main():
 
17
  import io
18
  import json
19
  import os
20
+ import re
21
  import sys
22
  import tempfile
23
  import threading
 
33
  from fastapi.middleware.cors import CORSMiddleware
34
  from fastapi.responses import FileResponse, JSONResponse, Response, StreamingResponse
35
 
36
+ # ── Voxtral TTS (vLLM-Omni, Mistral AI) ──────────────────────────────────────
37
# Base URL of the Voxtral (vLLM-Omni) server, overridable via the VOXTRAL_URL
# environment variable. Trailing slashes are stripped so that building a
# request URL as f"{_VOXTRAL_URL}/v1/..." never yields a double slash.
_VOXTRAL_URL = os.environ.get("VOXTRAL_URL", "http://localhost:8000").rstrip("/")
# Model identifier sent in the "model" field of Voxtral requests.
_VOXTRAL_MODEL = "mistralai/Voxtral-4B-TTS-2603"
39
+
40
  # ── Fish-Speech ───────────────────────────────────────────────────────────────
41
  FISH_SPEECH_REPO = Path("/tmp/fish-speech")
42
  FISH_SPEECH_MODEL = Path("/root/fish-speech-model")
 
299
  )
300
 
301
 
302
+ # ─── French TTS preprocessing ─────────────────────────────────────────────────
303
+
304
# Ordered (regex pattern, replacement) pairs applied one after another by
# preprocess_french() to spell out common French abbreviations.
FRENCH_ABBREVS = [
    (r'\bM\.\s+', 'Monsieur '),
    (r'\bMme\.?\s+', 'Madame '),
    (r'\bMlle\.?\s+', 'Mademoiselle '),
    (r'\bDr\.?\s+', 'Docteur '),
    (r'\bPr\.?\s+', 'Professeur '),
    (r'\bSt\.?\s+', 'Saint '),
    # NOTE(review): the replacement consumes the period, so a sentence-final
    # "etc." loses its full stop — confirm this is acceptable for prosody.
    (r'\betc\.(?!\w)', 'et cetera'),
    (r'\bn°\s*(\d+)', r'numéro \1'),
    # Clock times: "14h30" / "14 h 30", then the bare-hour form "14h".
    (r'\b(\d{1,2})\s*h\s*(\d{2})\b', r'\1 heures \2'),
    (r'\b(\d{1,2})\s*h\b', r'\1 heures'),
    (r'\bp\.\s*(\d+)\b', r'page \1'),
]

# Style instruction (in French) describing an audiobook-narrator delivery.
# It is not referenced by the code visible in this section.
FRENCH_NARRATOR_PROMPT = (
    "Narrateur professionnel de livres audio français, voix grave, chaude et captivante. "
    "Débit naturellement mesuré — ni précipité ni traînant. "
    "Respectez scrupuleusement la ponctuation : "
    "légère pause aux virgules, souffle marqué aux points, "
    "pause longue et respirée aux doubles sauts de paragraphe. "
    "Aux guillemets « », adoptez un registre légèrement plus direct et personnel pour le dialogue, "
    "puis revenez au ton narratif après le ». "
    "Les points de suspension (...) et les tirets (—) appellent une vraie pause respiratoire. "
    "Ton légèrement plus grave et plus riche que la conversation ordinaire, "
    "comme un conteur au coin du feu. "
    "Restez cohérent du début à la fin — même timbre, même rythme de fond."
)


def preprocess_french(text: str) -> str:
    """Prepare French text for TTS (prosody preprocessing, arXiv:2508.17494).

    Steps, in order:
      1. Expand every abbreviation listed in ``FRENCH_ABBREVS``.
      2. Put exactly one space inside guillemets (``« texte »``).
      3. Turn blank-line paragraph breaks into `` ... `` (a strong pause).
      4. Surround em-dashes with single spaces.
      5. Normalize ellipses (``…`` or 3+ dots) to ``...``.
      6. Collapse runs of spaces/tabs to a single space.
      7. Append one space after each run of ``!``/``?``, so sentence-ending
         punctuation that was already followed by a space ends up followed
         by two.

    Args:
        text: Raw input text.

    Returns:
        The processed text with leading/trailing whitespace removed.
    """
    for pattern, repl in FRENCH_ABBREVS:
        text = re.sub(pattern, repl, text)
    # Guillemet spacing
    text = re.sub(r'«\s*', '« ', text)
    text = re.sub(r'\s*»', ' »', text)
    # Paragraph breaks → strong pause
    text = re.sub(r'\n\s*\n', ' ... ', text)
    # Em-dash → spaced pause
    text = re.sub(r'\s*—\s*', ' — ', text)
    # Normalize ellipsis, including the single-character form U+2026
    text = re.sub(r'…|\.{3,}', '...', text)
    # Collapse horizontal whitespace before adding the post-sentence space,
    # otherwise the extra space would be collapsed away again.
    text = re.sub(r'[ \t]+', ' ', text)
    # Treat a run such as "?!" or "!!" as one unit so it is not split into
    # "? ! " by the inserted spaces.
    text = re.sub(r'([!?]+)', r'\1 ', text)
    return text.strip()
352
+
353
+
354
  # ─── Helpers ──────────────────────────────────────────────────────────────────
355
 
356
  def _to_wav_b64(audio: np.ndarray, sr: int) -> str:
 
436
  "queue_depth": _generation_waiters,
437
  "cached_models": list(_model_cache.keys()),
438
  "kokoro_voices": KOKORO_VOICES_FR,
439
+ "voxtral_url": _VOXTRAL_URL,
440
+ "voxtral_model": _VOXTRAL_MODEL,
441
  }
442
 
443
 
 
1030
  os.unlink(prev_path)
1031
 
1032
 
1033
@app.post("/generate/voxtral")
async def generate_voxtral(
    text: str = Form(...),
    ref_wav: UploadFile = File(None),
    ref_text: str = Form(""),
):
    """Generate speech via Voxtral TTS (vLLM-Omni server at VOXTRAL_URL).

    Requires a running `python3 voxtral_server.py` instance.
    Optionally upload a reference WAV for voice cloning; falls back to
    narrator_reference.wav in the same directory as this script.

    Args:
        text: Text to synthesize; run through preprocess_french() first.
        ref_wav: Optional reference audio clip for voice cloning.
        ref_text: Optional transcription of the reference clip.

    Returns:
        A ``Response`` carrying the upstream audio bytes as ``audio/wav``.

    Raises:
        HTTPException: 503 if the engine is disabled, httpx is missing, or the
            Voxtral server is unreachable; 400 if the text is empty after
            preprocessing or the reference clip exceeds MAX_AUDIO_BYTES;
            504 if the upstream request times out; 502 if the upstream
            answers with an error status; 500 for any other failure.
    """
    if not _engine_enabled("voxtral"):
        raise HTTPException(status_code=503, detail="Voxtral engine not enabled on this server.")

    # httpx is imported lazily so the server still starts without it.
    try:
        import httpx as _httpx
    except ImportError as exc:
        raise HTTPException(
            status_code=503, detail="httpx not installed. Run: pip install httpx"
        ) from exc

    # Preprocess French text for better prosody
    processed = preprocess_french(text)
    if not processed:
        # Nothing to synthesize; fail fast instead of calling the upstream.
        raise HTTPException(status_code=400, detail="Text is empty.")

    ref_b64: str | None = None
    if ref_wav and ref_wav.filename:
        ref_bytes = await ref_wav.read()
        if len(ref_bytes) > MAX_AUDIO_BYTES:
            raise HTTPException(
                status_code=400,
                detail=_AUDIO_TOO_LARGE_MSG.format(size_mb=len(ref_bytes) / 1024 / 1024),
            )
        ref_b64 = "data:audio/wav;base64," + base64.b64encode(ref_bytes).decode()
    else:
        # No upload: use the bundled narrator clip when it is present.
        narrator_ref = Path(__file__).parent / "narrator_reference.wav"
        if narrator_ref.exists():
            ref_b64 = "data:audio/wav;base64," + base64.b64encode(narrator_ref.read_bytes()).decode()

    payload: dict = {
        "model": _VOXTRAL_MODEL,
        "input": processed,
        "response_format": "wav",
    }
    if ref_b64:
        payload["ref_audio"] = ref_b64
        payload["ref_text"] = ref_text

    def _post_speech() -> bytes:
        # Blocking HTTP call; executed in a worker thread below.
        resp = _httpx.post(f"{_VOXTRAL_URL}/v1/audio/speech", json=payload, timeout=120.0)
        resp.raise_for_status()
        return resp.content

    # Map upstream failures to distinct gateway-style status codes instead of
    # a blanket 500, and keep the original exception chained for the logs.
    try:
        wav_bytes = await asyncio.to_thread(_post_speech)
    except _httpx.ConnectError as exc:
        raise HTTPException(
            status_code=503,
            detail=(
                f"Voxtral server not reachable at {_VOXTRAL_URL}. "
                "Start it with: python3 voxtral_server.py"
            ),
        ) from exc
    except _httpx.TimeoutException as exc:
        raise HTTPException(
            status_code=504,
            detail=f"Voxtral server at {_VOXTRAL_URL} timed out.",
        ) from exc
    except _httpx.HTTPStatusError as exc:
        raise HTTPException(
            status_code=502,
            detail=(
                f"Voxtral server returned HTTP {exc.response.status_code}: "
                f"{exc.response.text[:200]}"
            ),
        ) from exc
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    return Response(content=wav_bytes, media_type="audio/wav")
1092
+
1093
+
1094
  # ─── Entry point ──────────────────────────────────────────────────────────────
1095
 
1096
  def main():