Boovore_Multi-Engine-TTS-Studio

Running

Foradc Claude Sonnet 4.6 committed on Apr 1

Commit

737927a

1 Parent(s): f627e2c

feat: add Voxtral TTS engine + French prosody preprocessing

- Add preprocess_french() with FRENCH_ABBREVS (arXiv:2508.17494) — expands
abbreviations and normalizes guillemet spacing, em-dash spacing, and ellipses
- Add FRENCH_NARRATOR_PROMPT optimized for audiobook narration
- Add /generate/voxtral endpoint (vLLM-Omni, Mistral Voxtral-4B-TTS-2603)
with auto-fallback to narrator_reference.wav for voice cloning
- Add VOXTRAL_URL env var (default: http://localhost:8000)
- Update /status to expose voxtral_url and voxtral_model
- Add Voxtral pill (FR★) in engine selector UI
- Add Voxtral panel with ref audio upload in innerPanelFR
- Add generateVoxtral() JS function + i18n labels (FR/EN)
- Wire into ENGINE_TO_ITAB, ENGINE_TO_ROW, generateCurrent() switch

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show

index.html +61 -2
server.py +120 -0

index.html CHANGED Viewed

@@ -1049,6 +1049,11 @@ select option, select optgroup {
             Fish-Speech
             <span class="ep-tag" style="background:#0ea5e9">NEW</span>
           </button>
         </div>
       </div>
       <div class="engine-divider"></div>
@@ -1241,6 +1246,20 @@ select option, select optgroup {
         </button>
       </div>
     </div>
     </div><!-- /#innerPanelFR -->
     <!-- ── Panel Qwen3-TTS ── -->
@@ -1902,6 +1921,7 @@ const I18N = {
     engChatterbox: 'Chatterbox — clonage zero-shot (ResembleAI)',
     engF5: 'F5-TTS FR — clonage vocal français',
     engFish: 'Fish-Speech 1.5 — clonage vocal multilingue',
     genBtn: 'Générer',
     bookGenBtn: 'Générer la sélection',
     bookPause: 'Pause',
@@ -1917,6 +1937,7 @@ const I18N = {
     engChatterbox: 'Chatterbox — zero-shot cloning (ResembleAI)',
     engF5: 'F5-TTS FR — French voice cloning',
     engFish: 'Fish-Speech 1.5 — multilingual voice cloning',
     genBtn: 'Generate',
     bookGenBtn: 'Generate selection',
     bookPause: 'Pause',
@@ -1943,6 +1964,7 @@ function setLang(lang) {
   $('engLblChatterbox').textContent = t.engChatterbox;
   $('engLblF5').textContent = t.engF5;
   $('engLblFish').textContent = t.engFish;
   // Buttons
   $('genBtnLabel').textContent = t.genBtn;
   $('bookGenBtnLabel').textContent = t.bookGenBtn;
@@ -2113,8 +2135,8 @@ function switchInnerTab(tab) {
 // ── Engine pill selector ───────────────────────────────────────────────────────
 let currentEngine = 'kokoro'; // selected studio engine
-const ENGINE_TO_ITAB = { kokoro:'fr', chatterbox:'fr', f5:'fr', fish:'fr', clone:'qwen', custom:'qwen', design:'qwen' };
-const ENGINE_TO_ROW  = { kokoro:'rowKokoro', chatterbox:'rowChatterbox', f5:'rowF5', fish:'rowFish' };
 const ENGINE_TO_QWEN = { clone:'voice_clone', custom:'custom', design:'voice_design' };
 function selectStudioEngine(engine) {
@@ -2152,6 +2174,7 @@ async function generateCurrent() {
     case 'chatterbox': await generateChatterbox();  break;
     case 'f5':         await generateF5();           break;
     case 'fish':       await generateFish();         break;
     default: break;
   }
 }
@@ -2340,6 +2363,42 @@ async function generateFish() {
   busy = false; setPlayBtns(false);
 }
 async function loadModel() {
   const btn = $('loadBtn');
   const btnVis = $('loadBtnVis');

             Fish-Speech
             <span class="ep-tag" style="background:#0ea5e9">NEW</span>
           </button>
+          <button class="epill" id="pill-voxtral" style="--ep-color:#f59e0b" onclick="selectStudioEngine('voxtral')">
+            <span class="ep-dot" style="background:#f59e0b"></span>
+            Voxtral
+            <span class="ep-tag" style="background:#f59e0b;color:#000">FR★</span>
+          </button>
         </div>
       </div>
       <div class="engine-divider"></div>
         </button>
       </div>
     </div>
+    <!-- Voxtral TTS -->
+    <div class="mode-row" id="rowVoxtral">
+      <div class="label"><span id="engLblVoxtral">Voxtral 4B — TTS français Mistral AI (vLLM-Omni)</span><span class="switch-tag" style="background:#f59e0b;color:#000">FR★</span></div>
+      <div class="row" style="flex-wrap:wrap;gap:6px">
+        <label style="font-size:11px;color:var(--dim);flex-shrink:0">Réf audio :</label>
+        <input type="file" id="voxtralRefWav" accept="audio/*" style="font-size:11px;color:var(--dim);flex:2;min-width:120px;max-width:180px">
+        <label style="font-size:11px;color:var(--dim);flex-shrink:0">Texte réf :</label>
+        <input type="text" id="voxtralRefText" placeholder="Transcription du clip..." style="font-size:11px;flex:3;min-width:140px;background:var(--surface2);border:1px solid var(--border);border-radius:var(--radius-sm);color:var(--text);padding:4px 6px">
+      </div>
+      <div style="font-size:11px;color:var(--dim);margin-top:4px">
+        Nécessite un serveur vLLM-Omni local (<code style="font-size:10px;color:var(--accent)">python3 voxtral_server.py</code>).
+        Si aucun clip de référence, utilise <code style="font-size:10px;color:var(--accent)">narrator_reference.wav</code>.
+      </div>
+    </div>
     </div><!-- /#innerPanelFR -->
     <!-- ── Panel Qwen3-TTS ── -->
     engChatterbox: 'Chatterbox — clonage zero-shot (ResembleAI)',
     engF5: 'F5-TTS FR — clonage vocal français',
     engFish: 'Fish-Speech 1.5 — clonage vocal multilingue',
+    engVoxtral: 'Voxtral 4B — TTS français Mistral AI (vLLM-Omni)',
     genBtn: 'Générer',
     bookGenBtn: 'Générer la sélection',
     bookPause: 'Pause',
     engChatterbox: 'Chatterbox — zero-shot cloning (ResembleAI)',
     engF5: 'F5-TTS FR — French voice cloning',
     engFish: 'Fish-Speech 1.5 — multilingual voice cloning',
+    engVoxtral: 'Voxtral 4B — Mistral AI French TTS (vLLM-Omni)',
     genBtn: 'Generate',
     bookGenBtn: 'Generate selection',
     bookPause: 'Pause',
   $('engLblChatterbox').textContent = t.engChatterbox;
   $('engLblF5').textContent = t.engF5;
   $('engLblFish').textContent = t.engFish;
+  $('engLblVoxtral').textContent = t.engVoxtral;
   // Buttons
   $('genBtnLabel').textContent = t.genBtn;
   $('bookGenBtnLabel').textContent = t.bookGenBtn;
 // ── Engine pill selector ───────────────────────────────────────────────────────
 let currentEngine = 'kokoro'; // selected studio engine
+// Engine id → inner tab key ('fr' or 'qwen'); 'voxtral' is grouped with the other 'fr' engines.
+const ENGINE_TO_ITAB = { kokoro:'fr', chatterbox:'fr', f5:'fr', fish:'fr', voxtral:'fr', clone:'qwen', custom:'qwen', design:'qwen' };
+// Engine id → DOM id of that engine's settings row (e.g. 'rowVoxtral' is the Voxtral .mode-row).
+const ENGINE_TO_ROW  = { kokoro:'rowKokoro', chatterbox:'rowChatterbox', f5:'rowF5', fish:'rowFish', voxtral:'rowVoxtral' };
 const ENGINE_TO_QWEN = { clone:'voice_clone', custom:'custom', design:'voice_design' };
 function selectStudioEngine(engine) {
     case 'chatterbox': await generateChatterbox();  break;
     case 'f5':         await generateF5();           break;
     case 'fish':       await generateFish();         break;
+    case 'voxtral':    await generateVoxtral();      break;
     default: break;
   }
 }
   busy = false; setPlayBtns(false);
 }
+async function generateVoxtral() {
+  if (busy) return;
+  const text = $('textIn').value.trim();
+  if (!text) { showMsg('err', 'Entre du texte d\'abord.'); return; }
+  hideMsg();
+  busy = true; setPlayBtns(true);
+  $('results').classList.add('show');
+  $('player').parentElement.style.display = 'none';
+  $('waveInd').classList.remove('off');
+  $('pbarWrap').classList.add('show');
+  const pb = $('pbar'); pb.style.width = ''; pb.classList.add('spin');
+  resetMetrics();
+  const t0 = performance.now();
+  try {
+    const fd = new FormData();
+    fd.append('text', text);
+    const refFile = $('voxtralRefWav') && $('voxtralRefWav').files[0];
+    if (refFile) fd.append('ref_wav', refFile);
+    const refText = $('voxtralRefText') ? $('voxtralRefText').value.trim() : '';
+    if (refText) fd.append('ref_text', refText);
+    const res = await fetch('/generate/voxtral', { method: 'POST', body: fd });
+    if (!res.ok) { const e = await res.json(); throw new Error(e.detail || 'Erreur Voxtral'); }
+    const blob = new Blob([await res.arrayBuffer()], { type: 'audio/wav' });
+    const genSecs = (performance.now() - t0) / 1000;
+    setPlayer(blob);
+    $('player').play().catch(() => {});
+    pb.classList.remove('spin'); pb.style.width = '100%';
+    $('waveInd').classList.add('off');
+    $('mClient').textContent = genSecs.toFixed(1) + 's';
+  } catch(e) {
+    showMsg('err', e.message);
+    pb.classList.remove('spin'); pb.style.width = '0%';
+  }
+  busy = false; setPlayBtns(false);
+}
 async function loadModel() {
   const btn = $('loadBtn');
   const btnVis = $('loadBtnVis');

server.py CHANGED Viewed

@@ -17,6 +17,7 @@ import hashlib
 import io
 import json
 import os
 import sys
 import tempfile
 import threading
@@ -32,6 +33,10 @@ from fastapi import FastAPI, File, Form, HTTPException, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse, JSONResponse, Response, StreamingResponse
 # ── Fish-Speech ───────────────────────────────────────────────────────────────
 FISH_SPEECH_REPO = Path("/tmp/fish-speech")
 FISH_SPEECH_MODEL = Path("/root/fish-speech-model")
@@ -294,6 +299,58 @@ _AUDIO_TOO_LARGE_MSG = (
 )
 # ─── Helpers ──────────────────────────────────────────────────────────────────
 def _to_wav_b64(audio: np.ndarray, sr: int) -> str:
@@ -379,6 +436,8 @@ async def get_status():
         "queue_depth": _generation_waiters,
         "cached_models": list(_model_cache.keys()),
         "kokoro_voices": KOKORO_VOICES_FR,
     }
@@ -971,6 +1030,67 @@ async def generate_fish(
             os.unlink(prev_path)
 # ─── Entry point ──────────────────────────────────────────────────────────────
 def main():

 import io
 import json
 import os
+import re
 import sys
 import tempfile
 import threading
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import FileResponse, JSONResponse, Response, StreamingResponse
+# ── Voxtral TTS (vLLM-Omni, Mistral AI) ──────────────────────────────────────
+# Base URL of the Voxtral server; override with the VOXTRAL_URL environment
+# variable (defaults to http://localhost:8000).
+_VOXTRAL_URL   = os.environ.get("VOXTRAL_URL", "http://localhost:8000")
+# Model identifier sent as the "model" field of each speech request.
+_VOXTRAL_MODEL = "mistralai/Voxtral-4B-TTS-2603"
 # ── Fish-Speech ───────────────────────────────────────────────────────────────
 FISH_SPEECH_REPO = Path("/tmp/fish-speech")
 FISH_SPEECH_MODEL = Path("/root/fish-speech-model")
 )
+# ─── French TTS preprocessing ─────────────────────────────────────────────────
+# (regex pattern, replacement) pairs; preprocess_french() applies them with
+# re.sub, one after another, in list order.
+FRENCH_ABBREVS = [
+    # Titles: the abbreviation must be followed by whitespace; the trailing
+    # dot is mandatory for "M." and optional for the others.
+    (r'\bM\.\s+',              'Monsieur '),
+    (r'\bMme\.?\s+',           'Madame '),
+    (r'\bMlle\.?\s+',          'Mademoiselle '),
+    (r'\bDr\.?\s+',            'Docteur '),
+    (r'\bPr\.?\s+',            'Professeur '),
+    (r'\bSt\.?\s+',            'Saint '),
+    # "etc." → "et cetera" (the abbreviation's period is consumed by the match).
+    (r'\betc\.(?!\w)',          'et cetera'),
+    # "n° 12" → "numéro 12"
+    (r'\bn°\s*(\d+)',           r'numéro \1'),
+    # Clock times: "14h30" → "14 heures 30", "14h" → "14 heures".
+    # The replacement is always the plural "heures", including for "1h".
+    (r'\b(\d{1,2})\s*h\s*(\d{2})\b', r'\1 heures \2'),
+    (r'\b(\d{1,2})\s*h\b',     r'\1 heures'),
+    # Page references: "p. 12" → "page 12"
+    (r'\bp\.\s*(\d+)\b',       r'page \1'),
+]
+# French-language style instruction describing the desired audiobook narration:
+# deep, warm voice, measured pace, pauses driven by punctuation, and a slightly
+# more direct register for dialogue inside guillemets.
+# NOTE(review): not referenced by any code visible in this diff — confirm where
+# it is consumed.
+FRENCH_NARRATOR_PROMPT = (
+    "Narrateur professionnel de livres audio français, voix grave, chaude et captivante. "
+    "Débit naturellement mesuré — ni précipité ni traînant. "
+    "Respectez scrupuleusement la ponctuation : "
+    "légère pause aux virgules, souffle marqué aux points, "
+    "pause longue et respirée aux doubles sauts de paragraphe. "
+    "Aux guillemets « », adoptez un registre légèrement plus direct et personnel pour le dialogue, "
+    "puis revenez au ton narratif après le ». "
+    "Les points de suspension (...) et les tirets (—) appellent une vraie pause respiratoire. "
+    "Ton légèrement plus grave et plus riche que la conversation ordinaire, "
+    "comme un conteur au coin du feu. "
+    "Restez cohérent du début à la fin — même timbre, même rythme de fond."
+)
+def preprocess_french(text: str) -> str:
+    """French TTS prosody preprocessing (arXiv:2508.17494).
+    Expands abbreviations and inserts natural pause markers for the TTS model.
+    """
+    for pattern, repl in FRENCH_ABBREVS:
+        text = re.sub(pattern, repl, text)
+    # Guillemet spacing
+    text = re.sub(r'«\s*', '« ', text)
+    text = re.sub(r'\s*»', ' »', text)
+    # Paragraph breaks → strong pause
+    text = re.sub(r'\n\s*\n', ' ... ', text)
+    # Em-dash → spaced pause
+    text = re.sub(r'\s*—\s*', ' — ', text)
+    # Normalize ellipsis
+    text = re.sub(r'\.{3,}', '...', text)
+    # Normalize whitespace first, then add post-sentence double space
+    text = re.sub(r'[ \t]+', ' ', text)
+    text = re.sub(r'([!?])', r'\1 ', text)
+    return text.strip()
 # ─── Helpers ──────────────────────────────────────────────────────────────────
 def _to_wav_b64(audio: np.ndarray, sr: int) -> str:
         "queue_depth": _generation_waiters,
         "cached_models": list(_model_cache.keys()),
         "kokoro_voices": KOKORO_VOICES_FR,
+        "voxtral_url": _VOXTRAL_URL,
+        "voxtral_model": _VOXTRAL_MODEL,
     }
             os.unlink(prev_path)
+@app.post("/generate/voxtral")
+async def generate_voxtral(
+    text: str = Form(...),
+    ref_wav: UploadFile = File(None),
+    ref_text: str = Form(""),
+):
+    """Generate speech via Voxtral TTS (vLLM-Omni server at VOXTRAL_URL).
+    Requires a running `python3 voxtral_server.py` instance.
+    Optionally upload a reference WAV for voice cloning; falls back to
+    narrator_reference.wav in the same directory as this script.
+    """
+    if not _engine_enabled("voxtral"):
+        raise HTTPException(status_code=503, detail="Voxtral engine not enabled on this server.")
+    try:
+        import httpx as _httpx
+    except ImportError:
+        raise HTTPException(status_code=503, detail="httpx not installed. Run: pip install httpx")
+    # Preprocess French text for better prosody
+    processed = preprocess_french(text)
+    ref_b64: str | None = None
+    if ref_wav and ref_wav.filename:
+        ref_bytes = await ref_wav.read()
+        if len(ref_bytes) > MAX_AUDIO_BYTES:
+            raise HTTPException(status_code=400, detail=_AUDIO_TOO_LARGE_MSG.format(size_mb=len(ref_bytes) / 1024 / 1024))
+        ref_b64 = "data:audio/wav;base64," + base64.b64encode(ref_bytes).decode()
+    else:
+        narrator_ref = Path(__file__).parent / "narrator_reference.wav"
+        if narrator_ref.exists():
+            ref_b64 = "data:audio/wav;base64," + base64.b64encode(narrator_ref.read_bytes()).decode()
+    payload: dict = {
+        "model": _VOXTRAL_MODEL,
+        "input": processed,
+        "response_format": "wav",
+    }
+    if ref_b64:
+        payload["ref_audio"] = ref_b64
+        payload["ref_text"] = ref_text
+    def _run():
+        try:
+            r = _httpx.post(f"{_VOXTRAL_URL}/v1/audio/speech", json=payload, timeout=120.0)
+            r.raise_for_status()
+            return r.content
+        except _httpx.ConnectError:
+            raise RuntimeError(
+                f"Voxtral server not reachable at {_VOXTRAL_URL}. "
+                "Start it with: python3 voxtral_server.py"
+            )
+    try:
+        wav_bytes = await asyncio.to_thread(_run)
+        return Response(content=wav_bytes, media_type="audio/wav")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
 # ─── Entry point ──────────────────────────────────────────────────────────────
 def main():