Foradc Claude Sonnet 4.6 committed on
Commit ·
737927a
1
Parent(s): f627e2c
feat: add Voxtral TTS engine + French prosody preprocessing
Browse files
- Add preprocess_french() with FRENCH_ABBREVS (arXiv:2508.17494) — expands
abbreviations and normalizes guillemet spacing, em-dash spacing, and ellipses
- Add FRENCH_NARRATOR_PROMPT optimized for audiobook narration
- Add /generate/voxtral endpoint (vLLM-Omni, Mistral Voxtral-4B-TTS-2603)
with auto-fallback to narrator_reference.wav for voice cloning
- Add VOXTRAL_URL env var (default: http://localhost:8000)
- Update /status to expose voxtral_url and voxtral_model
- Add Voxtral pill (FR★) in engine selector UI
- Add Voxtral panel with ref audio upload in innerPanelFR
- Add generateVoxtral() JS function + i18n labels (FR/EN)
- Wire into ENGINE_TO_ITAB, ENGINE_TO_ROW, generateCurrent() switch
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- index.html +61 -2
- server.py +120 -0
index.html
CHANGED
|
@@ -1049,6 +1049,11 @@ select option, select optgroup {
|
|
| 1049 |
Fish-Speech
|
| 1050 |
<span class="ep-tag" style="background:#0ea5e9">NEW</span>
|
| 1051 |
</button>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1052 |
</div>
|
| 1053 |
</div>
|
| 1054 |
<div class="engine-divider"></div>
|
|
@@ -1241,6 +1246,20 @@ select option, select optgroup {
|
|
| 1241 |
</button>
|
| 1242 |
</div>
|
| 1243 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1244 |
</div><!-- /#innerPanelFR -->
|
| 1245 |
|
| 1246 |
<!-- ── Panel Qwen3-TTS ── -->
|
|
@@ -1902,6 +1921,7 @@ const I18N = {
|
|
| 1902 |
engChatterbox: 'Chatterbox — clonage zero-shot (ResembleAI)',
|
| 1903 |
engF5: 'F5-TTS FR — clonage vocal français',
|
| 1904 |
engFish: 'Fish-Speech 1.5 — clonage vocal multilingue',
|
|
|
|
| 1905 |
genBtn: 'Générer',
|
| 1906 |
bookGenBtn: 'Générer la sélection',
|
| 1907 |
bookPause: 'Pause',
|
|
@@ -1917,6 +1937,7 @@ const I18N = {
|
|
| 1917 |
engChatterbox: 'Chatterbox — zero-shot cloning (ResembleAI)',
|
| 1918 |
engF5: 'F5-TTS FR — French voice cloning',
|
| 1919 |
engFish: 'Fish-Speech 1.5 — multilingual voice cloning',
|
|
|
|
| 1920 |
genBtn: 'Generate',
|
| 1921 |
bookGenBtn: 'Generate selection',
|
| 1922 |
bookPause: 'Pause',
|
|
@@ -1943,6 +1964,7 @@ function setLang(lang) {
|
|
| 1943 |
$('engLblChatterbox').textContent = t.engChatterbox;
|
| 1944 |
$('engLblF5').textContent = t.engF5;
|
| 1945 |
$('engLblFish').textContent = t.engFish;
|
|
|
|
| 1946 |
// Buttons
|
| 1947 |
$('genBtnLabel').textContent = t.genBtn;
|
| 1948 |
$('bookGenBtnLabel').textContent = t.bookGenBtn;
|
|
@@ -2113,8 +2135,8 @@ function switchInnerTab(tab) {
|
|
| 2113 |
// ── Engine pill selector ───────────────────────────────────────────────────────
|
| 2114 |
let currentEngine = 'kokoro'; // selected studio engine
|
| 2115 |
|
| 2116 |
-
const ENGINE_TO_ITAB = { kokoro:'fr', chatterbox:'fr', f5:'fr', fish:'fr', clone:'qwen', custom:'qwen', design:'qwen' };
|
| 2117 |
-
const ENGINE_TO_ROW = { kokoro:'rowKokoro', chatterbox:'rowChatterbox', f5:'rowF5', fish:'rowFish' };
|
| 2118 |
const ENGINE_TO_QWEN = { clone:'voice_clone', custom:'custom', design:'voice_design' };
|
| 2119 |
|
| 2120 |
function selectStudioEngine(engine) {
|
|
@@ -2152,6 +2174,7 @@ async function generateCurrent() {
|
|
| 2152 |
case 'chatterbox': await generateChatterbox(); break;
|
| 2153 |
case 'f5': await generateF5(); break;
|
| 2154 |
case 'fish': await generateFish(); break;
|
|
|
|
| 2155 |
default: break;
|
| 2156 |
}
|
| 2157 |
}
|
|
@@ -2340,6 +2363,42 @@ async function generateFish() {
|
|
| 2340 |
busy = false; setPlayBtns(false);
|
| 2341 |
}
|
| 2342 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2343 |
async function loadModel() {
|
| 2344 |
const btn = $('loadBtn');
|
| 2345 |
const btnVis = $('loadBtnVis');
|
|
|
|
| 1049 |
Fish-Speech
|
| 1050 |
<span class="ep-tag" style="background:#0ea5e9">NEW</span>
|
| 1051 |
</button>
|
| 1052 |
+
<button class="epill" id="pill-voxtral" style="--ep-color:#f59e0b" onclick="selectStudioEngine('voxtral')">
|
| 1053 |
+
<span class="ep-dot" style="background:#f59e0b"></span>
|
| 1054 |
+
Voxtral
|
| 1055 |
+
<span class="ep-tag" style="background:#f59e0b;color:#000">FR★</span>
|
| 1056 |
+
</button>
|
| 1057 |
</div>
|
| 1058 |
</div>
|
| 1059 |
<div class="engine-divider"></div>
|
|
|
|
| 1246 |
</button>
|
| 1247 |
</div>
|
| 1248 |
</div>
|
| 1249 |
+
<!-- Voxtral TTS -->
|
| 1250 |
+
<div class="mode-row" id="rowVoxtral">
  <!-- Engine label; its text is overwritten by setLang() via #engLblVoxtral. -->
  <div class="label"><span id="engLblVoxtral">Voxtral 4B — TTS français Mistral AI (vLLM-Omni)</span><span class="switch-tag" style="background:#f59e0b;color:#000">FR★</span></div>
  <!-- Optional voice-cloning inputs: a reference clip and its transcript.
       Each label is bound to its control with `for` so that clicking the
       label focuses/opens the control and assistive tech announces it. -->
  <div class="row" style="flex-wrap:wrap;gap:6px">
    <label for="voxtralRefWav" style="font-size:11px;color:var(--dim);flex-shrink:0">Réf audio :</label>
    <input type="file" id="voxtralRefWav" accept="audio/*" style="font-size:11px;color:var(--dim);flex:2;min-width:120px;max-width:180px">
    <label for="voxtralRefText" style="font-size:11px;color:var(--dim);flex-shrink:0">Texte réf :</label>
    <input type="text" id="voxtralRefText" placeholder="Transcription du clip..." style="font-size:11px;flex:3;min-width:140px;background:var(--surface2);border:1px solid var(--border);border-radius:var(--radius-sm);color:var(--text);padding:4px 6px">
  </div>
  <!-- Usage hint shown under the inputs. -->
  <div style="font-size:11px;color:var(--dim);margin-top:4px">
    Nécessite un serveur vLLM-Omni local (<code style="font-size:10px;color:var(--accent)">python3 voxtral_server.py</code>).
    Si aucun clip de référence, utilise <code style="font-size:10px;color:var(--accent)">narrator_reference.wav</code>.
  </div>
</div>
|
| 1263 |
</div><!-- /#innerPanelFR -->
|
| 1264 |
|
| 1265 |
<!-- ── Panel Qwen3-TTS ── -->
|
|
|
|
| 1921 |
engChatterbox: 'Chatterbox — clonage zero-shot (ResembleAI)',
|
| 1922 |
engF5: 'F5-TTS FR — clonage vocal français',
|
| 1923 |
engFish: 'Fish-Speech 1.5 — clonage vocal multilingue',
|
| 1924 |
+
engVoxtral: 'Voxtral 4B — TTS français Mistral AI (vLLM-Omni)',
|
| 1925 |
genBtn: 'Générer',
|
| 1926 |
bookGenBtn: 'Générer la sélection',
|
| 1927 |
bookPause: 'Pause',
|
|
|
|
| 1937 |
engChatterbox: 'Chatterbox — zero-shot cloning (ResembleAI)',
|
| 1938 |
engF5: 'F5-TTS FR — French voice cloning',
|
| 1939 |
engFish: 'Fish-Speech 1.5 — multilingual voice cloning',
|
| 1940 |
+
engVoxtral: 'Voxtral 4B — Mistral AI French TTS (vLLM-Omni)',
|
| 1941 |
genBtn: 'Generate',
|
| 1942 |
bookGenBtn: 'Generate selection',
|
| 1943 |
bookPause: 'Pause',
|
|
|
|
| 1964 |
$('engLblChatterbox').textContent = t.engChatterbox;
|
| 1965 |
$('engLblF5').textContent = t.engF5;
|
| 1966 |
$('engLblFish').textContent = t.engFish;
|
| 1967 |
+
$('engLblVoxtral').textContent = t.engVoxtral;
|
| 1968 |
// Buttons
|
| 1969 |
$('genBtnLabel').textContent = t.genBtn;
|
| 1970 |
$('bookGenBtnLabel').textContent = t.bookGenBtn;
|
|
|
|
| 2135 |
// ── Engine pill selector ───────────────────────────────────────────────────────
|
| 2136 |
let currentEngine = 'kokoro'; // selected studio engine
|
| 2137 |
|
| 2138 |
+
const ENGINE_TO_ITAB = { kokoro:'fr', chatterbox:'fr', f5:'fr', fish:'fr', voxtral:'fr', clone:'qwen', custom:'qwen', design:'qwen' };
|
| 2139 |
+
const ENGINE_TO_ROW = { kokoro:'rowKokoro', chatterbox:'rowChatterbox', f5:'rowF5', fish:'rowFish', voxtral:'rowVoxtral' };
|
| 2140 |
const ENGINE_TO_QWEN = { clone:'voice_clone', custom:'custom', design:'voice_design' };
|
| 2141 |
|
| 2142 |
function selectStudioEngine(engine) {
|
|
|
|
| 2174 |
case 'chatterbox': await generateChatterbox(); break;
|
| 2175 |
case 'f5': await generateF5(); break;
|
| 2176 |
case 'fish': await generateFish(); break;
|
| 2177 |
+
case 'voxtral': await generateVoxtral(); break;
|
| 2178 |
default: break;
|
| 2179 |
}
|
| 2180 |
}
|
|
|
|
| 2363 |
busy = false; setPlayBtns(false);
|
| 2364 |
}
|
| 2365 |
|
| 2366 |
+
/**
 * Generate speech for the text in #textIn with the Voxtral engine.
 *
 * Posts the text (plus the optional reference clip from #voxtralRefWav and
 * its transcript from #voxtralRefText) as multipart form data to
 * /generate/voxtral, then hands the returned WAV to the player.
 * Does nothing if a generation is already running (`busy`).
 */
async function generateVoxtral() {
  if (busy) return;
  const text = $('textIn').value.trim();
  if (!text) { showMsg('err', 'Entre du texte d\'abord.'); return; }

  // Switch the UI into "generating" state.
  hideMsg();
  busy = true; setPlayBtns(true);
  $('results').classList.add('show');
  $('player').parentElement.style.display = 'none';
  $('waveInd').classList.remove('off');
  $('pbarWrap').classList.add('show');
  const pb = $('pbar'); pb.style.width = ''; pb.classList.add('spin');
  resetMetrics();
  const t0 = performance.now();

  try {
    const fd = new FormData();
    fd.append('text', text);
    const refInput = $('voxtralRefWav');
    const refFile = refInput && refInput.files[0];
    if (refFile) fd.append('ref_wav', refFile);
    const refTextInput = $('voxtralRefText');
    const refText = refTextInput ? refTextInput.value.trim() : '';
    if (refText) fd.append('ref_text', refText);

    const res = await fetch('/generate/voxtral', { method: 'POST', body: fd });
    if (!res.ok) {
      // The error body is not guaranteed to be JSON (e.g. a proxy error page),
      // and `detail` is not guaranteed to be a string; never let either mask
      // the real failure with a parse error or "[object Object]".
      let detail = '';
      try {
        const body = await res.json();
        if (body && body.detail) {
          detail = typeof body.detail === 'string' ? body.detail : JSON.stringify(body.detail);
        }
      } catch (_err) { /* non-JSON error body: fall through to the generic message */ }
      throw new Error(detail || `Erreur Voxtral (HTTP ${res.status})`);
    }

    const blob = new Blob([await res.arrayBuffer()], { type: 'audio/wav' });
    const genSecs = (performance.now() - t0) / 1000;
    setPlayer(blob);
    $('player').play().catch(() => {});  // autoplay may be blocked; ignore
    pb.style.width = '100%';
    $('mClient').textContent = genSecs.toFixed(1) + 's';
  } catch (e) {
    showMsg('err', e.message);
    pb.style.width = '0%';
  } finally {
    // Always leave the "generating" state, on success and on failure alike,
    // so the activity indicators stop and the next click is not blocked.
    pb.classList.remove('spin');
    $('waveInd').classList.add('off');
    busy = false; setPlayBtns(false);
  }
}
|
| 2401 |
+
|
| 2402 |
async function loadModel() {
|
| 2403 |
const btn = $('loadBtn');
|
| 2404 |
const btnVis = $('loadBtnVis');
|
server.py
CHANGED
|
@@ -17,6 +17,7 @@ import hashlib
|
|
| 17 |
import io
|
| 18 |
import json
|
| 19 |
import os
|
|
|
|
| 20 |
import sys
|
| 21 |
import tempfile
|
| 22 |
import threading
|
|
@@ -32,6 +33,10 @@ from fastapi import FastAPI, File, Form, HTTPException, UploadFile
|
|
| 32 |
from fastapi.middleware.cors import CORSMiddleware
|
| 33 |
from fastapi.responses import FileResponse, JSONResponse, Response, StreamingResponse
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
# ── Fish-Speech ───────────────────────────────────────────────────────────────
|
| 36 |
FISH_SPEECH_REPO = Path("/tmp/fish-speech")
|
| 37 |
FISH_SPEECH_MODEL = Path("/root/fish-speech-model")
|
|
@@ -294,6 +299,58 @@ _AUDIO_TOO_LARGE_MSG = (
|
|
| 294 |
)
|
| 295 |
|
| 296 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 297 |
# ─── Helpers ──────────────────────────────────────────────────────────────────
|
| 298 |
|
| 299 |
def _to_wav_b64(audio: np.ndarray, sr: int) -> str:
|
|
@@ -379,6 +436,8 @@ async def get_status():
|
|
| 379 |
"queue_depth": _generation_waiters,
|
| 380 |
"cached_models": list(_model_cache.keys()),
|
| 381 |
"kokoro_voices": KOKORO_VOICES_FR,
|
|
|
|
|
|
|
| 382 |
}
|
| 383 |
|
| 384 |
|
|
@@ -971,6 +1030,67 @@ async def generate_fish(
|
|
| 971 |
os.unlink(prev_path)
|
| 972 |
|
| 973 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 974 |
# ─── Entry point ──────────────────────────────────────────────────────────────
|
| 975 |
|
| 976 |
def main():
|
|
|
|
| 17 |
import io
|
| 18 |
import json
|
| 19 |
import os
|
| 20 |
+
import re
|
| 21 |
import sys
|
| 22 |
import tempfile
|
| 23 |
import threading
|
|
|
|
| 33 |
from fastapi.middleware.cors import CORSMiddleware
|
| 34 |
from fastapi.responses import FileResponse, JSONResponse, Response, StreamingResponse
|
| 35 |
|
| 36 |
+
# ── Voxtral TTS (vLLM-Omni, Mistral AI) ──────────────────────────────────────
|
| 37 |
+
# Base URL of the Voxtral server, taken from the VOXTRAL_URL environment
# variable. An unset or empty variable falls back to the local default, and any
# trailing "/" is dropped so that f"{_VOXTRAL_URL}/v1/..." never yields "//v1".
_VOXTRAL_URL = (os.environ.get("VOXTRAL_URL") or "http://localhost:8000").rstrip("/")
# Model identifier sent in the "model" field of each Voxtral request.
_VOXTRAL_MODEL = "mistralai/Voxtral-4B-TTS-2603"
|
| 39 |
+
|
| 40 |
# ── Fish-Speech ───────────────────────────────────────────────────────────────
|
| 41 |
FISH_SPEECH_REPO = Path("/tmp/fish-speech")
|
| 42 |
FISH_SPEECH_MODEL = Path("/root/fish-speech-model")
|
|
|
|
| 299 |
)
|
| 300 |
|
| 301 |
|
| 302 |
+
# ─── French TTS preprocessing ─────────────────────────────────────────────────
|
| 303 |
+
|
| 304 |
+
# Ordered (regex, replacement) pairs applied one after another by
# preprocess_french(). Order matters: the singular-hour rules must run before
# the generic plural ones, and each "h + minutes" rule before its bare "h" rule.
FRENCH_ABBREVS = [
    (r'\bM\.\s+', 'Monsieur '),
    (r'\bMme\.?\s+', 'Madame '),
    (r'\bMlle\.?\s+', 'Mademoiselle '),
    (r'\bDr\.?\s+', 'Docteur '),
    (r'\bPr\.?\s+', 'Professeur '),
    (r'\bSt\.?\s+', 'Saint '),
    (r'\betc\.(?!\w)', 'et cetera'),
    (r'\bn°\s*(\d+)', r'numéro \1'),
    # 0 h and 1 h take the singular "heure" in French.
    (r'\b(0?[01])\s*h\s*(\d{2})\b', r'\1 heure \2'),
    (r'\b(0?[01])\s*h\b', r'\1 heure'),
    (r'\b(\d{1,2})\s*h\s*(\d{2})\b', r'\1 heures \2'),
    (r'\b(\d{1,2})\s*h\b', r'\1 heures'),
    (r'\bp\.\s*(\d+)\b', r'page \1'),
]

# Style prompt (in French) describing the desired audiobook-narrator delivery.
FRENCH_NARRATOR_PROMPT = (
    "Narrateur professionnel de livres audio français, voix grave, chaude et captivante. "
    "Débit naturellement mesuré — ni précipité ni traînant. "
    "Respectez scrupuleusement la ponctuation : "
    "légère pause aux virgules, souffle marqué aux points, "
    "pause longue et respirée aux doubles sauts de paragraphe. "
    "Aux guillemets « », adoptez un registre légèrement plus direct et personnel pour le dialogue, "
    "puis revenez au ton narratif après le ». "
    "Les points de suspension (...) et les tirets (—) appellent une vraie pause respiratoire. "
    "Ton légèrement plus grave et plus riche que la conversation ordinaire, "
    "comme un conteur au coin du feu. "
    "Restez cohérent du début à la fin — même timbre, même rythme de fond."
)


def preprocess_french(text: str) -> str:
    """French TTS prosody preprocessing (arXiv:2508.17494).

    Expands the abbreviations listed in FRENCH_ABBREVS, then normalizes
    punctuation spacing so the TTS model sees consistent pause markers.

    Args:
        text: Raw input text.

    Returns:
        The normalized text with leading/trailing whitespace stripped
        (an empty string stays empty).
    """
    for pattern, repl in FRENCH_ABBREVS:
        text = re.sub(pattern, repl, text)
    # Guillemet spacing: exactly one space inside « ... ».
    text = re.sub(r'«\s*', '« ', text)
    text = re.sub(r'\s*»', ' »', text)
    # Paragraph breaks → strong pause.
    text = re.sub(r'\n\s*\n', ' ... ', text)
    # Em-dash → spaced pause.
    text = re.sub(r'\s*—\s*', ' — ', text)
    # Normalize ellipsis: any run of 3+ dots becomes exactly three.
    text = re.sub(r'\.{3,}', '...', text)
    # Collapse runs of spaces/tabs first, then add the extra post-sentence
    # space. A whole run of ! / ? is treated as one unit so that "?!" or "!!"
    # is not split into "? !" / "! !".
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'([!?]+)', r'\1 ', text)
    return text.strip()
|
| 352 |
+
|
| 353 |
+
|
| 354 |
# ─── Helpers ──────────────────────────────────────────────────────────────────
|
| 355 |
|
| 356 |
def _to_wav_b64(audio: np.ndarray, sr: int) -> str:
|
|
|
|
| 436 |
"queue_depth": _generation_waiters,
|
| 437 |
"cached_models": list(_model_cache.keys()),
|
| 438 |
"kokoro_voices": KOKORO_VOICES_FR,
|
| 439 |
+
"voxtral_url": _VOXTRAL_URL,
|
| 440 |
+
"voxtral_model": _VOXTRAL_MODEL,
|
| 441 |
}
|
| 442 |
|
| 443 |
|
|
|
|
| 1030 |
os.unlink(prev_path)
|
| 1031 |
|
| 1032 |
|
| 1033 |
+
@app.post("/generate/voxtral")
async def generate_voxtral(
    text: str = Form(...),
    ref_wav: UploadFile = File(None),
    ref_text: str = Form(""),
):
    """Generate speech via Voxtral TTS (vLLM-Omni server at VOXTRAL_URL).

    Requires a running `python3 voxtral_server.py` instance.
    Optionally upload a reference WAV for voice cloning; falls back to
    narrator_reference.wav in the same directory as this script.

    Args:
        text: Text to synthesize; run through preprocess_french() first.
        ref_wav: Optional reference audio clip for voice cloning.
        ref_text: Optional transcript of the reference clip.

    Returns:
        A Response with media type "audio/wav" carrying the upstream bytes.

    Raises:
        HTTPException: 400 if the text is empty after preprocessing or the
            reference clip exceeds MAX_AUDIO_BYTES; 503 if the engine is
            disabled, httpx is missing, or the Voxtral server is unreachable;
            504 if the Voxtral server times out; 502 if it answers with an
            error status; 500 for any other failure.
    """
    if not _engine_enabled("voxtral"):
        raise HTTPException(status_code=503, detail="Voxtral engine not enabled on this server.")

    # httpx is imported lazily so a missing package becomes a clear 503.
    try:
        import httpx as _httpx
    except ImportError as exc:
        raise HTTPException(status_code=503, detail="httpx not installed. Run: pip install httpx") from exc

    # Preprocess French text for better prosody.
    processed = preprocess_french(text)
    if not processed:
        # Nothing to synthesize; fail fast instead of calling the upstream.
        raise HTTPException(status_code=400, detail="Text is empty.")

    # Pick the reference clip: the uploaded file if any, else the bundled
    # narrator_reference.wav when it exists, else no reference at all.
    ref_bytes: bytes | None = None
    if ref_wav and ref_wav.filename:
        ref_bytes = await ref_wav.read()
        if len(ref_bytes) > MAX_AUDIO_BYTES:
            raise HTTPException(status_code=400, detail=_AUDIO_TOO_LARGE_MSG.format(size_mb=len(ref_bytes) / 1024 / 1024))
    else:
        narrator_ref = Path(__file__).parent / "narrator_reference.wav"
        try:
            # Read off the event loop; EAFP avoids an exists()/read race.
            ref_bytes = await asyncio.to_thread(narrator_ref.read_bytes)
        except FileNotFoundError:
            ref_bytes = None

    payload: dict = {
        "model": _VOXTRAL_MODEL,
        "input": processed,
        "response_format": "wav",
    }
    if ref_bytes is not None:
        payload["ref_audio"] = "data:audio/wav;base64," + base64.b64encode(ref_bytes).decode()
        payload["ref_text"] = ref_text

    def _post_to_voxtral() -> bytes:
        # Blocking HTTP call; executed in a worker thread below.
        r = _httpx.post(f"{_VOXTRAL_URL}/v1/audio/speech", json=payload, timeout=120.0)
        r.raise_for_status()
        return r.content

    try:
        wav_bytes = await asyncio.to_thread(_post_to_voxtral)
    except _httpx.ConnectError as exc:
        raise HTTPException(
            status_code=503,
            detail=(
                f"Voxtral server not reachable at {_VOXTRAL_URL}. "
                "Start it with: python3 voxtral_server.py"
            ),
        ) from exc
    except _httpx.TimeoutException as exc:
        raise HTTPException(
            status_code=504,
            detail=f"Voxtral server at {_VOXTRAL_URL} timed out.",
        ) from exc
    except _httpx.HTTPStatusError as exc:
        # Surface the upstream status and a bounded excerpt of its body.
        raise HTTPException(
            status_code=502,
            detail=f"Voxtral server returned HTTP {exc.response.status_code}: {exc.response.text[:500]}",
        ) from exc
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    return Response(content=wav_bytes, media_type="audio/wav")
|
| 1092 |
+
|
| 1093 |
+
|
| 1094 |
# ─── Entry point ──────────────────────────────────────────────────────────────
|
| 1095 |
|
| 1096 |
def main():
|