ChatterboxTTS-DNXS-Spokenwordv1 / wrapper /chunk_synthesizer.py
danneauxs
Deploy complete ChatterboxTTS system with utils, tools, and wrapper modules
346d87a
raw
history blame
8.4 kB
from pathlib import Path
import torch
import time
import re
from pydub import AudioSegment
from modules.tts_engine import load_optimized_model
from modules.file_manager import ensure_voice_sample_compatibility, list_voice_samples
from modules.audio_processor import apply_smart_fade_memory, smart_audio_validation_memory, process_audio_with_trimming_and_silence
from config.config import *
def get_original_voice_from_log(book_name):
"""Extract original voice name from run log"""
audiobook_root = Path(AUDIOBOOK_ROOT)
log_file = audiobook_root / book_name / "run.log"
if log_file.exists():
try:
with open(log_file, 'r', encoding='utf-8') as f:
for line in f:
line = line.strip()
if line.startswith("Voice: ") or line.startswith("Voice used: "):
voice_name = line.split(": ", 1)[1].strip()
print(f"πŸ“„ Found original voice in log: {voice_name}")
return voice_name
except Exception as e:
print(f"⚠️ Error reading run log: {e}")
return None
def get_original_voice_from_filename(book_name):
"""Extract voice name from existing audiobook filename"""
audiobook_root = Path(AUDIOBOOK_ROOT)
book_dir = audiobook_root / book_name
# Look for WAV files with voice pattern: BookName [VoiceName].wav
for wav_file in book_dir.glob("*.wav"):
match = re.search(r'\[([^\]]+)\]\.wav$', wav_file.name)
if match:
voice_name = match.group(1)
print(f"πŸ“ Found original voice in filename: {voice_name}")
return voice_name
# Look for M4B files with voice pattern: BookName[VoiceName].m4b
for m4b_file in book_dir.glob("*.m4b"):
match = re.search(r'\[([^\]]+)\]\.m4b$', m4b_file.name)
if match:
voice_name = match.group(1)
print(f"πŸ“ Found original voice in M4B filename: {voice_name}")
return voice_name
return None
def find_voice_file_by_name(voice_name):
"""Find voice file by name in Voice_Samples directory"""
voice_files = list_voice_samples()
# Exact match first
for voice_file in voice_files:
if voice_file.stem == voice_name:
print(f"βœ… Found exact voice match: {voice_file.name}")
return voice_file
# Partial match (case insensitive)
voice_name_lower = voice_name.lower()
for voice_file in voice_files:
if voice_name_lower in voice_file.stem.lower():
print(f"βœ… Found partial voice match: {voice_file.name}")
return voice_file
return None
def get_tts_params_for_chunk(chunk):
"""Extract TTS parameters from chunk data or prompt user"""
# Check if chunk has TTS params stored
if 'tts_params' in chunk:
tts_params = chunk['tts_params']
print(f"πŸ“Š Using stored TTS params: exag={tts_params.get('exaggeration', 1.0)}, cfg={tts_params.get('cfg_weight', 0.7)}, temp={tts_params.get('temperature', 0.7)}")
return tts_params
# Prompt user for TTS parameters
print(f"\nβš™οΈ TTS Parameters for chunk synthesis:")
def get_float_input(prompt, default):
while True:
try:
value = input(f"{prompt} [{default}]: ").strip()
if not value:
return default
return float(value)
except ValueError:
print(f"❌ Invalid input. Please enter a valid number.")
exaggeration = get_float_input("Exaggeration", DEFAULT_EXAGGERATION)
cfg_weight = get_float_input("CFG Weight", DEFAULT_CFG_WEIGHT)
temperature = get_float_input("Temperature", DEFAULT_TEMPERATURE)
return {
'exaggeration': exaggeration,
'cfg_weight': cfg_weight,
'temperature': temperature
}
def synthesize_chunk(chunk, index, book_name, audio_dir, revision=False, chunks_json_path=None, override_voice_name=None):
"""Generate audio for a single chunk using specified or detected voice and TTS parameters"""
filename = f"chunk_{index+1:05d}_rev.wav" if revision else f"chunk_{index+1:05d}.wav"
out_path = Path(audio_dir) / filename
try:
# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load TTS model
print(f"πŸ€– Loading TTS model for chunk synthesis...")
model = load_optimized_model(device)
# Determine voice to use
if override_voice_name:
# Use explicitly provided voice
print(f"🎀 Using explicitly selected voice: {override_voice_name}")
voice_path = find_voice_file_by_name(override_voice_name)
voice_name = override_voice_name
detection_method = "user_selected"
else:
# Use enhanced voice detection
print(f"πŸ” Detecting original voice for book: {book_name}")
from modules.voice_detector import detect_voice_for_book
voice_name, voice_path, detection_method = detect_voice_for_book(book_name, chunks_json_path)
# Fallback to first available voice if detection failed
if not voice_path:
print(f"⚠️ Voice not found, using first available voice")
voice_files = list_voice_samples()
if not voice_files:
print("❌ No voice samples found")
return None
voice_path = voice_files[0]
voice_name = voice_path.stem
detection_method = "fallback_first_available"
print(f"🎀 Using voice: {voice_name} (method: {detection_method})")
compatible_voice = ensure_voice_sample_compatibility(voice_path)
# Get TTS parameters for this chunk
tts_params = get_tts_params_for_chunk(chunk)
# Prepare model with voice
model.prepare_conditionals(compatible_voice)
# Get chunk text
chunk_text = chunk.get('text', '')
if not chunk_text:
print("❌ No text found in chunk")
return None
print(f"🎀 Synthesizing: {chunk_text[:50]}...")
print(f"πŸ“Š TTS params: exag={tts_params['exaggeration']}, cfg={tts_params['cfg_weight']}, temp={tts_params['temperature']}")
# Generate audio with specified parameters
with torch.no_grad():
wav = model.generate(chunk_text,
exaggeration=tts_params['exaggeration'],
cfg_weight=tts_params['cfg_weight'],
temperature=tts_params['temperature']).detach().cpu()
if wav.dim() == 1:
wav = wav.unsqueeze(0)
# Convert tensor to AudioSegment for processing
import io
import soundfile as sf
wav_np = wav.squeeze().numpy()
with io.BytesIO() as wav_buffer:
sf.write(wav_buffer, wav_np, model.sr, format='wav')
wav_buffer.seek(0)
audio_segment = AudioSegment.from_wav(wav_buffer)
# Apply audio processing
audio_segment = apply_smart_fade_memory(audio_segment)
audio_segment, is_quarantined = smart_audio_validation_memory(audio_segment, model.sr)
# Apply trimming and contextual silence based on boundary type
boundary_type = chunk.get('boundary_type', 'none')
if boundary_type and boundary_type != "none":
audio_segment = process_audio_with_trimming_and_silence(audio_segment, boundary_type)
else:
# Apply trimming even without boundary type if enabled
if ENABLE_AUDIO_TRIMMING:
from modules.audio_processor import trim_audio_endpoint
audio_segment = trim_audio_endpoint(audio_segment)
# Save final audio
audio_segment.export(out_path, format="wav")
print(f"βœ… Saved synthesized chunk: {out_path.name}")
# Clean up model
del model
torch.cuda.empty_cache()
return str(out_path)
except Exception as e:
print(f"❌ Failed to synthesize chunk: {e}")
import traceback
traceback.print_exc()
return None