""" TTS Engine Module Handles ChatterboxTTS interface, model loading, and chunk processing coordination """ import torch import gc import time import logging import shutil import sys import numpy as np from datetime import timedelta from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path import torchaudio as ta from config.config import * from modules.text_processor import smart_punctuate, sentence_chunk_text, detect_content_boundaries def find_chunks_json_file(book_name): """Find the corresponding chunks JSON file for a book""" from config.config import AUDIOBOOK_ROOT # Look in the TTS processing directory tts_chunks_dir = AUDIOBOOK_ROOT / book_name / "TTS" / "text_chunks" json_path = tts_chunks_dir / "chunks_info.json" if json_path.exists(): return json_path # Also check old Text_Input location for backwards compatibility text_input_dir = Path("Text_Input") possible_names = [ f"{book_name}_chunks.json", f"{book_name.lower()}_chunks.json", f"{book_name.replace(' ', '_')}_chunks.json" ] for name in possible_names: old_json_path = text_input_dir / name if old_json_path.exists(): return old_json_path return None from modules.audio_processor import ( smart_audio_validation, apply_smart_fade, add_chunk_end_silence, add_contextual_silence, pause_for_chunk_review, get_chunk_audio_duration, has_mid_energy_drop, apply_smart_fade_memory, smart_audio_validation_memory ) from modules.file_manager import ( setup_book_directories, find_book_files, ensure_voice_sample_compatibility, combine_audio_chunks, get_audio_files_in_directory, convert_to_m4b, add_metadata_to_m4b ) from modules.progress_tracker import setup_logging, log_chunk_progress, log_run # Global shutdown flag shutdown_requested = False # Console colors RED = '\033[91m' GREEN = '\033[92m' YELLOW = '\033[93m' CYAN = '\033[96m' RESET = '\033[0m' import random import numpy as np import torch def set_seed(seed_value: int): """ Sets the seed for torch, random, and numpy for reproducibility. This is called if a non-zero seed is provided for generation. """ torch.manual_seed(seed_value) if torch.cuda.is_available(): torch.cuda.manual_seed(seed_value) torch.cuda.manual_seed_all(seed_value) # if using multi-GPU if torch.backends.mps.is_available(): # Check if torch.mps exists before calling if hasattr(torch, 'mps') and torch.mps.is_available(): torch.mps.manual_seed(seed_value) random.seed(seed_value) np.random.seed(seed_value) logging.info(f"Global seed set to: {seed_value}") # ============================================================================ # MEMORY AND MODEL MANAGEMENT # ============================================================================ def monitor_gpu_activity(operation_name): """Lightweight GPU monitoring for high-speed processing""" # Disabled expensive pynvml queries to free up GPU cycles if torch.cuda.is_available(): allocated = torch.cuda.memory_allocated() / 1024**3 # Skip GPU utilization queries during production runs return allocated, 0 return 0, 0 def optimize_memory_usage(): """Aggressive memory management for 8GB VRAM""" torch.cuda.empty_cache() gc.collect() if torch.cuda.is_available(): torch.cuda.ipc_collect() def monitor_vram_usage(operation_name=""): """Real-time VRAM monitoring""" if torch.cuda.is_available(): allocated = torch.cuda.memory_allocated() / 1024**3 reserved = torch.cuda.memory_reserved() / 1024**3 if allocated > VRAM_SAFETY_THRESHOLD: logging.warning(f"โš ๏ธ High VRAM usage during {operation_name}: {allocated:.1f}GB allocated, {reserved:.1f}GB reserved") optimize_memory_usage() return allocated, reserved return 0, 0 def get_optimal_workers(): """Dynamic worker allocation based on VRAM usage""" if not USE_DYNAMIC_WORKERS: return MAX_WORKERS allocated_vram = torch.cuda.memory_allocated() / 1024**3 if allocated_vram < 5.0: return min(TEST_MAX_WORKERS, MAX_WORKERS) elif allocated_vram < VRAM_SAFETY_THRESHOLD: return min(2, MAX_WORKERS) else: return 1 def prewarm_model_with_voice(model, voice_path, tts_params=None): """ Pre-warm the TTS model with a voice sample to eliminate cold start quality issues. Args: model: Loaded TTS model voice_path: Path to voice sample file tts_params: Optional TTS parameters for pre-warming (uses defaults if None) Returns: model: The pre-warmed model (same object, but with cached conditioning) """ import tempfile import os from modules.file_manager import ensure_voice_sample_compatibility try: print("๐Ÿ”ฅ Pre-warming model with voice sample...") # Prepare voice for TTS compatible_voice = ensure_voice_sample_compatibility(voice_path) # Set up default TTS parameters if none provided if tts_params is None: tts_params = { 'exaggeration': 0.5, 'cfg_weight': 0.5, 'temperature': 0.9 } # Prepare voice conditionals model.prepare_conditionals(compatible_voice) # Generate a short dummy audio to fully warm up the model dummy_text = "The quick brown fox jumps over the lazy dog." print(f"๐ŸŽค Generating warm-up audio: '{dummy_text}'") # Generate dummy audio with the voice and parameters wav_np = model.generate( dummy_text, exaggeration=tts_params['exaggeration'], cfg_weight=tts_params['cfg_weight'], temperature=tts_params['temperature'] ) print("โœ… Model pre-warming completed - first chunk quality optimized") # Clean up any temporary audio data (don't save the dummy audio) del wav_np return model except Exception as e: print(f"โš ๏ธ Pre-warming failed: {e}") print("๐Ÿ“ Model will still work but first chunk may have quality variations") return model def get_best_available_device(): """Detect and return the best available device with proper fallback""" try: if torch.cuda.is_available(): # Test CUDA with a simple operation test_tensor = torch.tensor([1.0]).to("cuda") del test_tensor torch.cuda.empty_cache() return "cuda" except Exception as e: logging.warning(f"CUDA test failed: {e}") try: if torch.backends.mps.is_available(): # Test MPS with a simple operation test_tensor = torch.tensor([1.0]).to("mps") del test_tensor return "mps" except Exception as e: logging.warning(f"MPS test failed: {e}") return "cpu" def load_optimized_model(device): """Load TTS model with memory optimizations""" from src.chatterbox.tts import ChatterboxTTS try: # Try to load with FP16 if supported model = ChatterboxTTS.from_pretrained(device=device, torch_dtype=torch.float16) logging.info("โœ… Loaded model in FP16 mode (halved VRAM usage)") except: # Fallback to default loading model = ChatterboxTTS.from_pretrained(device=device) logging.info("โš ๏ธ Using FP32 mode (FP16 not supported)") # Only apply eval() and benchmark if the model has these attributes if hasattr(model, 'eval'): model.eval() # Set CUDNN benchmark for performance (if available) if torch.backends.cudnn.is_available(): torch.backends.cudnn.benchmark = True return model # ============================================================================ # CHUNK PROCESSING # ============================================================================ def patch_alignment_layer(tfmr, alignment_layer_idx=12): """Patch alignment layer to avoid recursion""" from types import MethodType target_layer = tfmr.layers[alignment_layer_idx].self_attn original_forward = target_layer.forward def patched_forward(self, *args, **kwargs): kwargs['output_attentions'] = True return original_forward(*args, **kwargs) target_layer.forward = MethodType(patched_forward, target_layer) def process_batch( batch, text_chunks_dir, audio_chunks_dir, voice_path, tts_params, start_time, total_chunks, punc_norm, basename, log_run_func, log_path, device, model, asr_model, seed=0, enable_asr=None ): if seed != 0: set_seed(seed) """ Process a batch of chunks using the batch-enabled TTS model. """ from pydub import AudioSegment import io import soundfile as sf # 1. Prepare batch for TTS texts = [chunk_data['text'] for chunk_data in batch] # All params are the same, so we take them from the first chunk shared_tts_params = batch[0].get("tts_params", tts_params) supported_params = {"exaggeration", "cfg_weight", "temperature", "min_p", "top_p", "repetition_penalty"} tts_args = {k: v for k, v in shared_tts_params.items() if k in supported_params} # 2. Generate audio in a batch try: with torch.no_grad(): wavs = model.generate_batch(texts, **tts_args) except Exception as e: logging.error(f"โŒ Batch TTS generation failed: {e}") # Fallback to individual processing for this batch results = [] for chunk_data in batch: i = chunk_data['index'] chunk = chunk_data['text'] boundary_type = chunk_data.get("boundary_type", "none") chunk_tts_params = chunk_data.get("tts_params", tts_params) result = process_one_chunk(i, chunk, text_chunks_dir, audio_chunks_dir, voice_path, chunk_tts_params, start_time, total_chunks, punc_norm, basename, log_run_func, log_path, device, model, asr_model, boundary_type, enable_asr) results.append(result) return results # 3. Process and save each audio file from the batch batch_results = [] for i, wav_tensor in enumerate(wavs): chunk_data = batch[i] chunk_index = chunk_data['index'] boundary_type = chunk_data.get("boundary_type", "none") chunk_id_str = f"{chunk_index+1:05}" if wav_tensor.dim() == 1: wav_tensor = wav_tensor.unsqueeze(0) wav_np = wav_tensor.squeeze().cpu().numpy() with io.BytesIO() as wav_buffer: sf.write(wav_buffer, wav_np, model.sr, format='wav') wav_buffer.seek(0) audio_segment = AudioSegment.from_wav(wav_buffer) # Apply trimming and contextual silence from modules.audio_processor import process_audio_with_trimming_and_silence, trim_audio_endpoint if boundary_type and boundary_type != "none": final_audio = process_audio_with_trimming_and_silence(audio_segment, boundary_type) elif ENABLE_AUDIO_TRIMMING: final_audio = trim_audio_endpoint(audio_segment) else: final_audio = audio_segment # Final save final_path = audio_chunks_dir / f"chunk_{chunk_id_str}.wav" final_audio.export(final_path, format="wav") logging.info(f"โœ… Saved final chunk from batch: {final_path.name}") batch_results.append((chunk_index, final_path)) return batch_results def process_one_chunk( i, chunk, text_chunks_dir, audio_chunks_dir, voice_path, tts_params, start_time, total_chunks, punc_norm, basename, log_run_func, log_path, device, model, asr_model, seed=0, boundary_type="none", enable_asr=None ): if seed != 0: set_seed(seed) """Enhanced chunk processing with quality control, contextual silence, and deep cleanup""" import difflib from pydub import AudioSegment chunk_id_str = f"{i+1:05}" chunk_path = text_chunks_dir / f"chunk_{chunk_id_str}.txt" with open(chunk_path, 'w', encoding='utf-8') as cf: cf.write(chunk) chunk_audio_path = audio_chunks_dir / f"chunk_{chunk_id_str}.wav" # ============================================================================ # ENHANCED PERIODIC DEEP CLEANUP # ============================================================================ cleanup_interval = CLEANUP_INTERVAL # Skip cleanup on model reinitialization chunks to avoid conflicts if (i + 1) % cleanup_interval == 0 and (i + 1) % BATCH_SIZE != 0: print(f"\n๐Ÿงน {YELLOW}DEEP CLEANUP at chunk {i+1}/{total_chunks}...{RESET}") # Enhanced VRAM monitoring before cleanup allocated_before = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else 0 reserved_before = torch.cuda.memory_reserved() / 1024**3 if torch.cuda.is_available() else 0 print(f" Before: VRAM Allocated: {allocated_before:.1f}GB | Reserved: {reserved_before:.1f}GB") # Bulk temp file cleanup print(" ๐Ÿ—‘๏ธ Cleaning bulk temporary files...") temp_patterns = ["*_try*.wav", "*_pre.wav", "*_fade*.wav", "*_debug*.wav", "*_temp*.wav", "*_backup*.wav"] total_temp_files = 0 for pattern in temp_patterns: temp_files = list(audio_chunks_dir.glob(pattern)) for temp_file in temp_files: temp_file.unlink(missing_ok=True) total_temp_files += len(temp_files) if total_temp_files > 0: print(f" ๐Ÿ—‘๏ธ Removed {total_temp_files} temporary audio files") # Aggressive CUDA context reset print(" ๐Ÿ”„ Performing aggressive CUDA context reset...") torch.cuda.synchronize() torch.cuda.empty_cache() torch.cuda.ipc_collect() # Force CUDA context reset if hasattr(torch.cuda, 'reset_peak_memory_stats'): torch.cuda.reset_peak_memory_stats() if hasattr(torch._C, '_cuda_clearCublasWorkspaces'): torch._C._cuda_clearCublasWorkspaces() # Force garbage collection multiple times for _ in range(3): gc.collect() # Clear model cache if it has one if hasattr(model, 'clear_cache'): model.clear_cache() elif hasattr(model, 'reset_states'): model.reset_states() # Brief pause to let GPU settle time.sleep(1.0) # Monitor after cleanup allocated_after = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else 0 reserved_after = torch.cuda.memory_reserved() / 1024**3 if torch.cuda.is_available() else 0 print(f" After: VRAM Allocated: {allocated_after:.1f}GB | Reserved: {reserved_after:.1f}GB") print(f" Freed: {allocated_before - allocated_after:.1f}GB allocated, {reserved_before - reserved_after:.1f}GB reserved") print(f"๐Ÿงน {GREEN}Deep cleanup complete!{RESET}\n") best_sim, best_asr_text = -1, "" wav_path_active = None attempt_paths = [] mid_drop_retries = 0 max_mid_drop_retries = 2 # Enhanced regeneration loop with quality validation max_attempts = MAX_REGENERATION_ATTEMPTS if ENABLE_REGENERATION_LOOP else 2 current_tts_params = tts_params.copy() # Debug: Log the initial parameters for this chunk logging.info(f"๐ŸŽ›๏ธ Chunk {chunk_id_str} initial TTS params: exag={current_tts_params.get('exaggeration', 'N/A'):.3f}, cfg={current_tts_params.get('cfg_weight', 'N/A'):.3f}, temp={current_tts_params.get('temperature', 'N/A'):.3f}, min_p={current_tts_params.get('min_p', 'N/A'):.3f}") for attempt_num in range(max_attempts): logging.info(f"๐Ÿ” Starting TTS for chunk {chunk_id_str}, attempt {attempt_num + 1}/{max_attempts}") if attempt_num > 0: logging.info(f"๐Ÿ”ง Adjusted params: exag={current_tts_params.get('exaggeration', 'N/A'):.3f}, cfg={current_tts_params.get('cfg_weight', 'N/A'):.3f}, temp={current_tts_params.get('temperature', 'N/A'):.3f}") try: # Filter to only supported ChatterboxTTS parameters supported_params = {"exaggeration", "cfg_weight", "temperature", "min_p", "top_p", "repetition_penalty"} tts_args = {k: v for k, v in current_tts_params.items() if k in supported_params} # monitor_gpu_activity(f"Before TTS chunk_{chunk_id_str}") # Disabled for speed with torch.no_grad(): wav = model.generate(chunk, **tts_args).detach().cpu() # monitor_gpu_activity(f"After TTS chunk_{chunk_id_str}") # Disabled for speed if wav.dim() == 1: wav = wav.unsqueeze(0) # Convert tensor to AudioSegment for in-memory processing import io import soundfile as sf from pydub import AudioSegment # Convert wav tensor to AudioSegment (in memory) wav_np = wav.squeeze().numpy() with io.BytesIO() as wav_buffer: sf.write(wav_buffer, wav_np, model.sr, format='wav') wav_buffer.seek(0) audio_segment = AudioSegment.from_wav(wav_buffer) # Enhanced quality validation quality_score = 1.0 # Start with perfect score # Legacy mid-energy drop check (converted to score) if ENABLE_MID_DROP_CHECK and has_mid_energy_drop(wav, model.sr): quality_score *= 0.3 # Significant penalty for mid-drop logging.info(f"โš ๏ธ Mid-chunk energy drop detected in {chunk_id_str}") # Enhanced quality validation (if enabled) if ENABLE_REGENERATION_LOOP: from modules.audio_processor import evaluate_chunk_quality # Pass existing ASR model to avoid loading duplicate composite_score = evaluate_chunk_quality(audio_segment, chunk, include_spectral=True, asr_model=asr_model) quality_score *= composite_score logging.info(f"๐Ÿ“Š Quality score for {chunk_id_str}: {quality_score:.3f} (composite: {composite_score:.3f})") # ASR validation (memory-based processing) asr_score = 1.0 # Default to passed if ASR disabled # Use parameter if provided, otherwise fall back to config asr_enabled = enable_asr if enable_asr is not None else ENABLE_ASR if asr_enabled and asr_model is not None: from modules.audio_processor import calculate_text_similarity try: # Process ASR completely in memory - no disk writes samples = np.array(audio_segment.get_array_of_samples()) if audio_segment.channels == 2: samples = samples.reshape((-1, 2)).mean(axis=1) # Normalize to float32 for ASR model audio_np = samples.astype(np.float32) / audio_segment.max_possible_amplitude result = asr_model.transcribe(audio_np) if not isinstance(result, dict) or "text" not in result: raise ValueError(f"Invalid ASR result type: {type(result)}") asr_text = result.get("text", "").strip() asr_score = calculate_text_similarity(punc_norm(chunk), asr_text) logging.info(f"๐ŸŽค ASR similarity for chunk {chunk_id_str}: {asr_score:.3f} - Expected: '{punc_norm(chunk)}' Got: '{asr_text}'") except Exception as e: logging.error(f"โŒ ASR failed for {chunk_id_str}: {e}") asr_score = 0.8 # Use neutral score instead of 0 to avoid regeneration # Include ASR score in overall quality quality_score *= asr_score # Final quality check with all validations if quality_score >= QUALITY_THRESHOLD or attempt_num == max_attempts - 1: if quality_score >= QUALITY_THRESHOLD: logging.info(f"โœ… Quality acceptable for {chunk_id_str} on attempt {attempt_num + 1} (final score: {quality_score:.3f})") else: logging.info(f"โš ๏ธ Max attempts reached for {chunk_id_str}, accepting best effort (final score: {quality_score:.3f})") # Quality acceptable or max attempts reached, continue with processing final_audio = audio_segment best_sim = asr_score if asr_enabled else 1.0 best_asr_text = asr_text if asr_enabled and 'asr_text' in locals() else "" break else: # Quality too low, adjust parameters for retry logging.info(f"๐Ÿ”„ Quality below threshold ({quality_score:.3f} < {QUALITY_THRESHOLD}), adjusting parameters for retry {attempt_num + 2}") from modules.audio_processor import adjust_parameters_for_retry current_tts_params = adjust_parameters_for_retry(current_tts_params, quality_score, attempt_num) continue except Exception as e: import traceback logging.error(f"Exception during TTS attempt {attempt_num + 1} for chunk {chunk_id_str}: {e}") traceback.print_exc() continue if 'final_audio' not in locals(): logging.info(f"โŒ Chunk {chunk_id_str} failed all attempts.") return None, None # Apply trimming and contextual silence in memory before final save from modules.audio_processor import process_audio_with_trimming_and_silence if boundary_type and boundary_type != "none": final_audio = process_audio_with_trimming_and_silence(final_audio, boundary_type) print(f"๐Ÿ”‡ Added {boundary_type} silence to chunk {i+1:05}") else: # Apply trimming even without boundary type if enabled if ENABLE_AUDIO_TRIMMING: from modules.audio_processor import trim_audio_endpoint final_audio = trim_audio_endpoint(final_audio) # Note: ENABLE_CHUNK_END_SILENCE is now handled by punctuation-specific silence # The new system provides more precise silence based on actual punctuation # Final save - only disk write in entire process final_path = audio_chunks_dir / f"chunk_{chunk_id_str}.wav" final_audio.export(final_path, format="wav") logging.info(f"โœ… Saved final chunk: {final_path.name}") # No intermediate file cleanup needed - all processing done in memory # Log details - only log ASR failures if asr_enabled and best_sim < 0.8: log_run_func(f"ASR VALIDATION FAILED - Chunk {chunk_id_str}:\nExpected:\n{chunk}\nActual:\n{best_asr_text}\nSimilarity: {best_sim:.3f}\n" + "="*50, log_path) elif not asr_enabled: log_run_func(f"Chunk {chunk_id_str}: Original text: {chunk}", log_path) # Silence already added in memory above - no disk processing needed # Enhanced regular cleanup (every chunk) del wav optimize_memory_usage() # Additional per-chunk cleanup for long runs if (i + 1) % 50 == 0: torch.cuda.empty_cache() gc.collect() return i, final_path # ============================================================================ # MAIN BOOK PROCESSING FUNCTION # ============================================================================ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer from wrapper.chunk_loader import save_chunks def smooth_sentiment_scores(scores, index, method="rolling", window=3): """ Apply sentiment smoothing to prevent harsh emotional transitions. Args: scores: List of compound sentiment scores index: Current chunk index method: "rolling" for moving average, "exp_decay" for exponential decay window: Number of previous chunks to consider Returns: float: Smoothed sentiment score """ if index == 0: return scores[0] start_idx = max(0, index - window + 1) window_scores = scores[start_idx:index + 1] if method == "rolling": return sum(window_scores) / len(window_scores) elif method == "exp_decay": weights = SENTIMENT_EXP_DECAY_WEIGHTS[:len(window_scores)] weighted_sum = sum(w * s for w, s in zip(weights, reversed(window_scores))) weight_sum = sum(weights[:len(window_scores)]) return weighted_sum / weight_sum if weight_sum > 0 else window_scores[-1] else: return scores[index] # No smoothing def generate_enriched_chunks(text_file, output_dir, user_tts_params=None, quality_params=None, config_params=None, voice_name=None): """Reads a text file, performs VADER sentiment analysis, and returns enriched chunks.""" analyzer = SentimentIntensityAnalyzer() # Extract quality parameters for JSON generation (GUI overrides config) if quality_params: enable_smoothing = quality_params.get('sentiment_smoothing', ENABLE_SENTIMENT_SMOOTHING) smoothing_window = quality_params.get('smoothing_window', SENTIMENT_SMOOTHING_WINDOW) smoothing_method = quality_params.get('smoothing_method', SENTIMENT_SMOOTHING_METHOD) print(f"๐Ÿ”ง JSON Generation: Using GUI smoothing settings - Enabled: {enable_smoothing}, Window: {smoothing_window}, Method: {smoothing_method}") else: enable_smoothing = ENABLE_SENTIMENT_SMOOTHING smoothing_window = SENTIMENT_SMOOTHING_WINDOW smoothing_method = SENTIMENT_SMOOTHING_METHOD print(f"๐Ÿ”ง JSON Generation: Using config smoothing settings - Enabled: {enable_smoothing}") # Extract VADER sensitivity parameters (GUI overrides config) if config_params: vader_exag_sensitivity = config_params.get('vader_exag_sensitivity', VADER_EXAGGERATION_SENSITIVITY) vader_cfg_sensitivity = config_params.get('vader_cfg_sensitivity', VADER_CFG_WEIGHT_SENSITIVITY) vader_temp_sensitivity = config_params.get('vader_temp_sensitivity', VADER_TEMPERATURE_SENSITIVITY) print(f"๐Ÿ”ง JSON Generation: Using GUI VADER sensitivity - Exag: {vader_exag_sensitivity}, CFG: {vader_cfg_sensitivity}, Temp: {vader_temp_sensitivity}") else: vader_exag_sensitivity = VADER_EXAGGERATION_SENSITIVITY vader_cfg_sensitivity = VADER_CFG_WEIGHT_SENSITIVITY vader_temp_sensitivity = VADER_TEMPERATURE_SENSITIVITY print(f"๐Ÿ”ง JSON Generation: Using config VADER sensitivity - Exag: {vader_exag_sensitivity}, CFG: {vader_cfg_sensitivity}, Temp: {vader_temp_sensitivity}") raw_text = text_file.read_text(encoding='utf-8') cleaned = smart_punctuate(raw_text) chunks = sentence_chunk_text(cleaned) # Use user-provided parameters as base, or fall back to config defaults if user_tts_params: base_exaggeration = user_tts_params.get('exaggeration', BASE_EXAGGERATION) base_cfg_weight = user_tts_params.get('cfg_weight', BASE_CFG_WEIGHT) base_temperature = user_tts_params.get('temperature', BASE_TEMPERATURE) base_min_p = user_tts_params.get('min_p', DEFAULT_MIN_P) base_top_p = user_tts_params.get('top_p', DEFAULT_TOP_P) base_repetition_penalty = user_tts_params.get('repetition_penalty', DEFAULT_REPETITION_PENALTY) use_vader = user_tts_params.get('use_vader', True) # Default to True for backward compatibility else: base_exaggeration = BASE_EXAGGERATION base_cfg_weight = BASE_CFG_WEIGHT base_temperature = BASE_TEMPERATURE base_min_p = DEFAULT_MIN_P base_top_p = DEFAULT_TOP_P base_repetition_penalty = DEFAULT_REPETITION_PENALTY use_vader = True # Default behavior enriched = [] chunk_texts = [chunk_text for chunk_text, _ in chunks] # First pass: collect all sentiment scores raw_sentiment_scores = [] for chunk_text, _ in chunks: sentiment_scores = analyzer.polarity_scores(chunk_text) raw_sentiment_scores.append(sentiment_scores['compound']) # Second pass: apply smoothing and generate parameters for i, (chunk_text, is_para_end) in enumerate(chunks): # Get original sentiment score raw_compound_score = raw_sentiment_scores[i] # Apply sentiment smoothing if enabled (uses GUI settings, not config) if use_vader and enable_smoothing: compound_score = smooth_sentiment_scores( raw_sentiment_scores, i, method=smoothing_method, window=smoothing_window ) # Debug: Log sentiment changes if abs(compound_score - raw_compound_score) > 0.1: logging.info(f"๐Ÿ“Š Chunk {i+1:05}: sentiment smoothed {raw_compound_score:.3f} โ†’ {compound_score:.3f}") else: compound_score = raw_compound_score if use_vader: # Apply VADER sentiment adjustments using smoothed score exaggeration = base_exaggeration + (compound_score * vader_exag_sensitivity) cfg_weight = base_cfg_weight + (compound_score * vader_cfg_sensitivity) temperature = base_temperature + (compound_score * vader_temp_sensitivity) min_p = base_min_p + (compound_score * VADER_MIN_P_SENSITIVITY) repetition_penalty = base_repetition_penalty + (compound_score * VADER_REPETITION_PENALTY_SENSITIVITY) # Clamp values to defined min/max (ensure JSON values respect bounds) exaggeration = round(max(TTS_PARAM_MIN_EXAGGERATION, min(exaggeration, TTS_PARAM_MAX_EXAGGERATION)), 2) cfg_weight = round(max(TTS_PARAM_MIN_CFG_WEIGHT, min(cfg_weight, TTS_PARAM_MAX_CFG_WEIGHT)), 2) temperature = round(max(TTS_PARAM_MIN_TEMPERATURE, min(temperature, TTS_PARAM_MAX_TEMPERATURE)), 2) min_p = round(max(TTS_PARAM_MIN_MIN_P, min(min_p, TTS_PARAM_MAX_MIN_P)), 3) repetition_penalty = round(max(TTS_PARAM_MIN_REPETITION_PENALTY, min(repetition_penalty, TTS_PARAM_MAX_REPETITION_PENALTY)), 1) # Debug: Log VADER-adjusted parameters for significant changes if abs(exaggeration - base_exaggeration) > 0.05 or abs(cfg_weight - base_cfg_weight) > 0.05: logging.info(f"๐ŸŽญ Chunk {i+1:05}: VADER adjusted params - exag: {base_exaggeration:.2f}โ†’{exaggeration:.2f}, cfg: {base_cfg_weight:.2f}โ†’{cfg_weight:.2f}, sentiment: {compound_score:.3f}") else: # Use fixed base values (no VADER adjustment) exaggeration = base_exaggeration cfg_weight = base_cfg_weight temperature = base_temperature min_p = base_min_p repetition_penalty = base_repetition_penalty boundary_type = detect_content_boundaries(chunk_text, i, chunk_texts, is_para_end) enriched.append({ "index": i, "text": chunk_text, "word_count": len(chunk_text.split()), "boundary_type": boundary_type if boundary_type else "none", "sentiment_compound": compound_score, # Store smoothed score "sentiment_raw": raw_compound_score, # Store original score for reference "tts_params": { "exaggeration": exaggeration, "cfg_weight": cfg_weight, "temperature": temperature, "min_p": min_p, "top_p": base_top_p, # Top-P remains constant (not adjusted by VADER) "repetition_penalty": repetition_penalty } }) output_json_path = output_dir / "chunks_info.json" # Add voice metadata if provided if voice_name: # Try metadata method first try: # Create metadata entry as first element metadata = { "_metadata": True, "voice_used": voice_name, "generation_timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), "total_chunks": len(enriched) } enriched_with_metadata = [metadata] + enriched save_chunks(output_json_path, enriched_with_metadata) print(f"โœ… Saved voice metadata: {voice_name}") except Exception as e: # Fallback to comment method if metadata fails print(f"โš ๏ธ Metadata method failed, using comment fallback: {e}") save_chunks(output_json_path, enriched) # Add voice as comment from modules.voice_detector import add_voice_to_json add_voice_to_json(output_json_path, voice_name, method="comment") else: save_chunks(output_json_path, enriched) return enriched def process_book_folder(book_dir, voice_path, tts_params, device, skip_cleanup=False, enable_asr=None, quality_params=None, config_params=None, specific_text_file=None): """Enhanced book processing with batch processing to prevent hangs""" print(f"๐Ÿ” DEBUG: Entering process_book_folder with book_dir='{book_dir}', voice_path='{voice_path}'") # Apply GUI quality parameters to override config defaults if quality_params: print(f"๐Ÿ”ง Applying GUI quality parameters: {quality_params}") # Override config values with GUI settings global ENABLE_REGENERATION_LOOP, ENABLE_SENTIMENT_SMOOTHING, ENABLE_MFCC_VALIDATION global ENABLE_OUTPUT_VALIDATION, QUALITY_THRESHOLD, OUTPUT_VALIDATION_THRESHOLD global SENTIMENT_SMOOTHING_WINDOW, SENTIMENT_SMOOTHING_METHOD, SPECTRAL_ANOMALY_THRESHOLD ENABLE_REGENERATION_LOOP = quality_params.get('regeneration_enabled', ENABLE_REGENERATION_LOOP) ENABLE_SENTIMENT_SMOOTHING = quality_params.get('sentiment_smoothing', ENABLE_SENTIMENT_SMOOTHING) ENABLE_MFCC_VALIDATION = quality_params.get('mfcc_validation', ENABLE_MFCC_VALIDATION) ENABLE_OUTPUT_VALIDATION = quality_params.get('output_validation', ENABLE_OUTPUT_VALIDATION) QUALITY_THRESHOLD = quality_params.get('quality_threshold', QUALITY_THRESHOLD) OUTPUT_VALIDATION_THRESHOLD = quality_params.get('output_threshold', OUTPUT_VALIDATION_THRESHOLD) SENTIMENT_SMOOTHING_WINDOW = quality_params.get('smoothing_window', SENTIMENT_SMOOTHING_WINDOW) SENTIMENT_SMOOTHING_METHOD = quality_params.get('smoothing_method', SENTIMENT_SMOOTHING_METHOD) SPECTRAL_ANOMALY_THRESHOLD = quality_params.get('spectral_threshold', SPECTRAL_ANOMALY_THRESHOLD) print(f"โœ… Quality settings applied - Regeneration: {ENABLE_REGENERATION_LOOP}, MFCC: {ENABLE_MFCC_VALIDATION}, Output Validation: {ENABLE_OUTPUT_VALIDATION}") from src.chatterbox.tts import punc_norm print(f"๐Ÿ” DEBUG: Successfully imported punc_norm") # Setup directories print(f"๐Ÿ” DEBUG: Calling setup_book_directories...") output_root, tts_dir, text_chunks_dir, audio_chunks_dir = setup_book_directories(book_dir) print(f"๐Ÿ” DEBUG: Directory setup complete") # Clean previous processing files (but skip for resume operations) if skip_cleanup: print(f"๐Ÿ”„ RESUME MODE: Skipping cleanup to preserve existing chunks") print(f"๐Ÿ“ Preserving: {text_chunks_dir}, {audio_chunks_dir}") else: print(f"๐Ÿงน FRESH PROCESSING: Cleaning previous processing files...") import glob # Clear text chunks for txt_file in text_chunks_dir.glob("*.txt"): txt_file.unlink(missing_ok=True) for json_file in text_chunks_dir.glob("*.json"): json_file.unlink(missing_ok=True) # Clear audio chunks for wav_file in audio_chunks_dir.glob("*.wav"): wav_file.unlink(missing_ok=True) # Clear logs for log_file in output_root.glob("*.log"): log_file.unlink(missing_ok=True) print(f"โœ… Cleanup complete") # Find book files print(f"๐Ÿ” DEBUG: Calling find_book_files...") book_files = find_book_files(book_dir) # Use specific text file if provided (GUI selection), otherwise use auto-detected file if specific_text_file: text_file_to_use = Path(specific_text_file) print(f"๐ŸŽฏ DEBUG: Using GUI-selected text file: {text_file_to_use}") if not text_file_to_use.exists(): logging.error(f"[{book_dir.name}] ERROR: Selected text file not found: {text_file_to_use}") return None, None, [] else: text_file_to_use = book_files['text'] print(f"๐Ÿ” DEBUG: Using auto-detected text file: {text_file_to_use}") if not text_file_to_use: logging.info(f"[{book_dir.name}] ERROR: No .txt files found in the book folder.") return None, None, [] cover_file = book_files['cover'] nfo_file = book_files['nfo'] setup_logging(output_root) # Extract voice name for logging and JSON metadata voice_name_for_log = voice_path.stem if hasattr(voice_path, 'stem') else Path(voice_path).stem # Generate enriched chunks with VADER analysis using user parameters and GUI quality settings print(f"๐Ÿ” DEBUG: About to call generate_enriched_chunks with quality_params: {quality_params}") print(f"๐Ÿ” DEBUG: About to call generate_enriched_chunks with config_params: {config_params}") print(f"๐Ÿ” DEBUG: Using voice: {voice_name_for_log}") all_chunks = generate_enriched_chunks(text_file_to_use, text_chunks_dir, tts_params, quality_params, config_params, voice_name_for_log) # Create run_log_lines print(f"๐Ÿ” DEBUG: Creating run_log_lines...") print(f"๐Ÿ” DEBUG: voice_path type: {type(voice_path)}, value: {voice_path}") run_log_lines = [ f"\n===== Processing: {book_dir.name} =====", f"Voice: {voice_name_for_log}", f"Started: {time.strftime('%Y-%m-%d %H:%M:%S')}", f"Text file processed: {text_file_to_use.name}", f"Total chunks generated: {len(all_chunks)}" ] start_time = time.time() total_chunks = len(all_chunks) log_path = output_root / "chunk_validation.log" total_audio_duration = 0.0 # Batch processing print(f"๐Ÿ“Š Processing {total_chunks} chunks in batches of {BATCH_SIZE}") all_results = [] for batch_start in range(0, total_chunks, BATCH_SIZE): batch_end = min(batch_start + BATCH_SIZE, total_chunks) batch_chunks = all_chunks[batch_start:batch_end] print(f"\n๐Ÿ”„ Processing batch: chunks {batch_start+1}-{batch_end}") # Fresh model for each batch model = load_optimized_model(device) compatible_voice = ensure_voice_sample_compatibility(voice_path, output_dir=tts_dir) # Pre-warm model to eliminate first chunk quality variations model = prewarm_model_with_voice(model, compatible_voice, tts_params) # Load ASR model once per batch if needed using adaptive manager asr_model = None asr_device_used = None # Use parameter if provided, otherwise fall back to config asr_enabled = enable_asr if enable_asr is not None else ENABLE_ASR if asr_enabled: from modules.asr_manager import load_asr_model_adaptive # Get ASR config from parameters asr_config = config_params.get('asr_config', {}) if config_params else {} # Use adaptive ASR manager for intelligent loading asr_model, asr_device_used = load_asr_model_adaptive(asr_config) if asr_model is None: print(f"โŒ ASR model loading failed completely - disabling ASR for this batch") asr_enabled = False futures = [] batch_results = [] # Dynamic worker allocation optimal_workers = get_optimal_workers() print(f"๐Ÿ”ง Using {optimal_workers} workers for batch {batch_start+1}-{batch_end}") use_vader = tts_params.get('use_vader', True) if not use_vader: # --- BATCH MODE --- print(f"๐Ÿš€ VADER disabled. Running in high-performance batch mode.") tts_batch_size = config_params.get('tts_batch_size', 16) chunk_batches = [batch_chunks[i:i + tts_batch_size] for i in range(0, len(batch_chunks), tts_batch_size)] print(f"๐Ÿ“Š Processing {len(batch_chunks)} chunks in {len(chunk_batches)} batches of size {tts_batch_size}.") with ThreadPoolExecutor(max_workers=optimal_workers) as executor: for batch in chunk_batches: if shutdown_requested: break futures.append(executor.submit( process_batch, batch, text_chunks_dir, audio_chunks_dir, voice_path, tts_params, start_time, total_chunks, punc_norm, book_dir.name, log_run, log_path, device, model, asr_model, all_chunks, asr_enabled )) # Wait for batches to complete for fut in as_completed(futures): try: # process_batch returns a list of (idx, wav_path) tuples results_list = fut.result() for idx, wav_path in results_list: if wav_path and wav_path.exists(): chunk_duration = get_chunk_audio_duration(wav_path) total_audio_duration += chunk_duration batch_results.append((idx, wav_path)) log_chunk_progress(len(batch_results), total_chunks, start_time, total_audio_duration) except Exception as e: logging.error(f"Future failed in batch: {e}") else: # --- SINGLE/NUANCED MODE --- print(f"๐ŸŽจ VADER enabled. Running in nuanced, single-chunk mode.") with ThreadPoolExecutor(max_workers=optimal_workers) as executor: for i, chunk_data in enumerate(batch_chunks): global_chunk_index = batch_start + i # Check for shutdown request if shutdown_requested: print(f"\nโน๏ธ {YELLOW}Stopping submission of new chunks...{RESET}") break # Handle both dictionary and tuple formats for chunk data if isinstance(chunk_data, dict): chunk = chunk_data["text"] boundary_type = chunk_data.get("boundary_type", "none") # Use chunk-specific TTS params if available, otherwise fall back to global chunk_tts_params = chunk_data.get("tts_params", tts_params) else: # Handle old tuple format (text, is_para_end) - convert to boundary_type chunk = chunk_data[0] if len(chunk_data) > 0 else str(chunk_data) # Convert old is_paragraph_end to boundary_type is_old_para_end = chunk_data[1] if len(chunk_data) > 1 else False boundary_type = "paragraph_end" if is_old_para_end else "none" chunk_tts_params = tts_params # Fallback for old format futures.append(executor.submit( process_one_chunk, global_chunk_index, chunk, text_chunks_dir, audio_chunks_dir, voice_path, chunk_tts_params, start_time, total_chunks, punc_norm, book_dir.name, log_run, log_path, device, model, asr_model, boundary_type, asr_enabled )) # Wait for batch to complete print(f"๐Ÿ”„ {CYAN}Waiting for batch {batch_start+1}-{batch_end} to complete...{RESET}") completed_count = 0 for fut in as_completed(futures): try: idx, wav_path = fut.result() if wav_path and wav_path.exists(): # Measure actual audio duration for this chunk chunk_duration = get_chunk_audio_duration(wav_path) total_audio_duration += chunk_duration batch_results.append((idx, wav_path)) # Update progress every 10 chunks within batch completed_count += 1 if completed_count % 2 == 0: log_chunk_progress(batch_start + completed_count - 1, total_chunks, start_time, total_audio_duration) except Exception as e: logging.error(f"Future failed in batch: {e}") # Clean up model after batch print(f"๐Ÿงน Cleaning up after batch {batch_start+1}-{batch_end}") del model if asr_model: from modules.asr_manager import cleanup_asr_model cleanup_asr_model(asr_model) torch.cuda.empty_cache() gc.collect() time.sleep(2) all_results.extend(batch_results) print(f"โœ… Batch {batch_start+1}-{batch_end} completed ({len(batch_results)} chunks)") # Final processing quarantine_dir = audio_chunks_dir / "quarantine" pause_for_chunk_review(quarantine_dir) # Collect final chunk paths chunk_paths = get_audio_files_in_directory(audio_chunks_dir) if not chunk_paths: logging.info(f"{RED}โŒ No valid audio chunks found. Skipping concatenation and conversion.{RESET}") return None, None, [] # Calculate timing elapsed_total = time.time() - start_time elapsed_td = timedelta(seconds=int(elapsed_total)) total_audio_duration_final = sum(get_chunk_audio_duration(chunk_path) for chunk_path in chunk_paths) audio_duration_td = timedelta(seconds=int(total_audio_duration_final)) realtime_factor = total_audio_duration_final / elapsed_total if elapsed_total > 0 else 0.0 print(f"\nโฑ๏ธ TTS Processing Complete:") print(f" Elapsed Time: {CYAN}{str(elapsed_td)}{RESET}") print(f" Audio Duration: {GREEN}{str(audio_duration_td)}{RESET}") print(f" Realtime Factor: {YELLOW}{realtime_factor:.2f}x{RESET}") # Combine audio voice_name = voice_path.stem if hasattr(voice_path, 'stem') else Path(voice_path).stem combined_wav_path = output_root / f"{book_dir.name} [{voice_name}].wav" print("\n๐Ÿ’พ Saving WAV file...") combine_audio_chunks(chunk_paths, combined_wav_path) # M4B conversion with normalization temp_m4b_path = output_root / "output.m4b" final_m4b_path = output_root / f"{book_dir.name}[{voice_name}].m4b" convert_to_m4b(combined_wav_path, temp_m4b_path) add_metadata_to_m4b(temp_m4b_path, final_m4b_path, cover_file, nfo_file) logging.info(f"Audiobook created: {final_m4b_path}") # Add final info to run log run_log_lines.extend([ f"Combined WAV: {combined_wav_path}", "--- Generation Settings ---", f"Batch Processing: Enabled ({BATCH_SIZE} chunks per batch)", f"ASR Enabled: {ENABLE_ASR}", f"Hum Detection: {ENABLE_HUM_DETECTION}", f"Dynamic Workers: {USE_DYNAMIC_WORKERS}", f"Voice used: {voice_name}", f"Exaggeration: {tts_params['exaggeration']}", f"CFG weight: {tts_params['cfg_weight']}", f"Temperature: {tts_params['temperature']}", f"Processing Time: {str(elapsed_td)}", f"Audio Duration: {str(audio_duration_td)}", f"Realtime Factor: {realtime_factor:.2f}x", f"Total Chunks: {len(chunk_paths)}" ]) # Write the run log log_run("\n".join(run_log_lines), output_root / "run.log") print(f"๐Ÿ“ Run log written to: {output_root / 'run.log'}") return final_m4b_path, combined_wav_path, run_log_lines