"""
Voice Cloner Module

Main API for few-shot voice cloning using XTTS v2.
"""

import functools
import os
import warnings
from pathlib import Path
from typing import Optional, Tuple, Union

import numpy as np
import torch

# soundfile / librosa are only needed by save_audio() / load_audio(); guard
# them the same way TTS is guarded below so the module stays importable.
try:
    import soundfile as sf
except ImportError:
    sf = None

try:
    import librosa
except ImportError:
    librosa = None

# NOTE(review): the status glyphs inside the print() messages below look like
# UTF-8 emoji that were decoded with a single-byte code page. They are
# reproduced unchanged here - confirm the intended characters and file encoding.

# Module-wide: silence every warning raised after this point (third-party
# noise). clone_voice() locally overrides this for its own long-text warning.
warnings.filterwarnings('ignore')

# Set environment variable to agree to TTS license for non-commercial use
os.environ['COQUI_TOS_AGREED'] = '1'

# Fix PyTorch 2.6+ weights_only issue - disable weights_only for TTS models.
# SECURITY: this patches torch.load for the WHOLE process. With
# weights_only=False a checkpoint is unpickled without restrictions, so only
# load checkpoints from sources you trust.
_original_torch_load = torch.load


@functools.wraps(_original_torch_load)
def _patched_torch_load(*args, **kwargs):
    """Call the real ``torch.load`` with ``weights_only=False`` unless the caller set it."""
    kwargs.setdefault('weights_only', False)
    return _original_torch_load(*args, **kwargs)


torch.load = _patched_torch_load

try:
    from TTS.api import TTS
except ImportError:
    print("Warning: TTS not installed. Run: pip install TTS")
    TTS = None


_DEFAULT_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
# Friendly labels reported by get_model_info(); unknown models report their id.
_MODEL_LABELS = {_DEFAULT_MODEL_NAME: "XTTS v2"}
_FALLBACK_SAMPLE_RATE = 24000
_LONG_TEXT_THRESHOLD = 1000


def _is_cuda(device) -> bool:
    """Return True for 'cuda' as well as indexed devices such as 'cuda:0'."""
    return str(device).startswith("cuda")


class VoiceCloner:
    """
    Few-shot voice cloning system using XTTS v2

    Features:
    - Clone any voice with 5-30 seconds of reference audio
    - Multi-speaker support
    - Real-time inference optimized for RTX 5060 Ti
    - Mixed precision (FP16) support
    """

    def __init__(
        self,
        model_name: str = _DEFAULT_MODEL_NAME,
        device: str = "cuda",
        use_fp16: bool = True,
        cache_dir: Optional[str] = None
    ):
        """
        Initialize the Voice Cloner

        Args:
            model_name: TTS model name (default: XTTS v2)
            device: Device to run on ('cuda', 'cuda:N' or 'cpu'). Falls back
                to 'cpu' when CUDA is not available.
            use_fp16: Use mixed precision for faster inference (CUDA only)
            cache_dir: Directory to cache models. When given it is created
                and exported as the ``TTS_HOME`` environment variable.

        Raises:
            ImportError: If the TTS library is not installed.
            Exception: Any error raised while loading the model is printed
                and re-raised unchanged.
        """
        if TTS is None:
            raise ImportError("TTS library not installed. Run: pip install TTS")

        self.model_name = model_name
        self.device = device if torch.cuda.is_available() else "cpu"
        on_cuda = _is_cuda(self.device)
        self.use_fp16 = use_fp16 and on_cuda

        if cache_dir is not None:
            # Coqui TTS resolves its model directory from TTS_HOME when the
            # model manager is created, so this must happen before TTS(...).
            # NOTE(review): confirm against the installed TTS version.
            cache_path = Path(cache_dir).expanduser()
            cache_path.mkdir(parents=True, exist_ok=True)
            os.environ["TTS_HOME"] = str(cache_path)

        print(f"šŸš€ Initializing Voice Cloner on {self.device}...")
        print(f" Model: {model_name}")
        print(f" Mixed Precision (FP16): {self.use_fp16}")

        # Initialize TTS model
        try:
            self.tts = TTS(
                model_name=model_name,
                gpu=on_cuda
            )

            # Move to device; some TTS objects have no synthesizer/tts_model.
            synthesizer = getattr(self.tts, 'synthesizer', None)
            model = getattr(synthesizer, 'tts_model', None)
            if model is not None:
                model.to(self.device)

                # Enable FP16 if requested
                if self.use_fp16:
                    model.half()
                    print(" āœ“ FP16 enabled")

            print("āœ“ Voice Cloner initialized successfully!")

        except Exception as e:
            print(f"āŒ Error initializing TTS model: {e}")
            raise

    def clone_voice(
        self,
        text: str,
        reference_audio_path: Union[str, Path],
        language: str = "en",
        output_path: Optional[Union[str, Path]] = None,
        speed: float = 1.0
    ) -> Tuple[np.ndarray, int]:
        """
        Clone a voice and synthesize speech

        Args:
            text: Text to synthesize
            reference_audio_path: Path to reference audio (5-30s recommended)
            language: Language code ('en', 'es', 'fr', 'de', 'it', 'pt', 'pl',
                'tr', 'ru', 'nl', 'cs', 'ar', 'zh-cn', 'ja', 'hu', 'ko')
            output_path: Optional path to save output audio
            speed: Speech speed multiplier (default: 1.0), must be > 0

        Returns:
            Tuple of (audio_array, sample_rate)

        Raises:
            ValueError: If ``text`` is empty/whitespace or ``speed`` is not positive.
            FileNotFoundError: If the reference audio does not exist.
            Exception: Synthesis errors are printed and re-raised unchanged.
        """
        # Validate inputs
        if not text or not text.strip():
            raise ValueError("Text cannot be empty")

        if speed <= 0:
            raise ValueError(f"speed must be positive, got {speed}")

        if len(text) > _LONG_TEXT_THRESHOLD:
            # The module-level filterwarnings('ignore') would swallow this
            # warning, so force it through for the duration of the call.
            with warnings.catch_warnings():
                warnings.simplefilter("always")
                warnings.warn(
                    "Text is very long (>1000 chars). Consider splitting for better quality.",
                    stacklevel=2,
                )

        reference_audio_path = Path(reference_audio_path)
        if not reference_audio_path.exists():
            raise FileNotFoundError(f"Reference audio not found: {reference_audio_path}")

        print(f"šŸŽ¤ Cloning voice from: {reference_audio_path.name}")
        print(f"šŸ“ Text length: {len(text)} characters")
        print(f"šŸŒ Language: {language}")

        try:
            # Synthesize speech. torch.autocast replaces the deprecated
            # torch.cuda.amp.autocast; use_fp16 is only ever true on CUDA.
            with torch.autocast(device_type="cuda", enabled=self.use_fp16):
                wav = self.tts.tts(
                    text=text,
                    speaker_wav=str(reference_audio_path),
                    language=language,
                    speed=speed
                )

            # Convert to numpy array
            if isinstance(wav, torch.Tensor):
                wav = wav.detach().cpu().numpy()
            elif isinstance(wav, list):
                wav = np.array(wav)

            # Get sample rate
            sample_rate = self.tts.synthesizer.output_sample_rate

            # Save if output path provided
            if output_path:
                self.save_audio(wav, output_path, sample_rate)
                print(f"āœ“ Audio saved to: {output_path}")

            print(f"āœ“ Synthesis complete! Duration: {len(wav)/sample_rate:.2f}s")

            return wav, sample_rate

        except Exception as e:
            print(f"āŒ Error during synthesis: {e}")
            raise

    def clone_multiple_speakers(
        self,
        text: str,
        speaker_references: dict,
        language: str = "en",
        output_dir: Optional[Union[str, Path]] = None
    ) -> dict:
        """
        Synthesize the same text in multiple voices

        A failure for one speaker is printed and does not stop the others.

        Args:
            text: Text to synthesize
            speaker_references: Dict mapping speaker names to reference audio paths
            language: Language code
            output_dir: Directory to save outputs (one ``<speaker>.wav`` each)

        Returns:
            Dict mapping speaker names to (audio_array, sample_rate) tuples,
            or to None for speakers whose synthesis failed
        """
        results = {}

        target_dir = None
        if output_dir:
            target_dir = Path(output_dir)
            target_dir.mkdir(parents=True, exist_ok=True)

        print(f"šŸŽ­ Synthesizing for {len(speaker_references)} speakers...")

        for speaker_name, ref_path in speaker_references.items():
            print(f"\n--- Speaker: {speaker_name} ---")

            output_path = None
            if target_dir is not None:
                output_path = target_dir / f"{speaker_name}.wav"

            try:
                results[speaker_name] = self.clone_voice(
                    text=text,
                    reference_audio_path=ref_path,
                    language=language,
                    output_path=output_path
                )
            except Exception as e:
                # Deliberate best-effort: record the failure and carry on.
                print(f"āŒ Failed for {speaker_name}: {e}")
                results[speaker_name] = None

        succeeded = sum(result is not None for result in results.values())
        print(f"\nāœ“ Completed {succeeded}/{len(speaker_references)} speakers")

        return results

    @staticmethod
    def save_audio(
        audio: np.ndarray,
        output_path: Union[str, Path],
        sample_rate: int = 24000
    ):
        """
        Save audio to file

        Parent directories are created as needed. Samples outside [-1, 1]
        are hard-clipped (the signal is not rescaled).

        Args:
            audio: Audio array
            output_path: Output file path
            sample_rate: Sample rate (default: 24000 Hz)

        Raises:
            ImportError: If soundfile is not installed.
        """
        if sf is None:
            raise ImportError("soundfile library not installed. Run: pip install soundfile")

        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        audio = np.asarray(audio)
        if audio.dtype == np.float16:
            # Half-precision inference can yield float16 samples, which
            # soundfile does not accept; widen before writing.
            audio = audio.astype(np.float32)

        # Hard-limit samples to the valid range
        audio = np.clip(audio, -1.0, 1.0)

        sf.write(str(output_path), audio, sample_rate)

    @staticmethod
    def load_audio(
        audio_path: Union[str, Path],
        target_sr: int = 24000
    ) -> Tuple[np.ndarray, int]:
        """
        Load and resample audio

        Args:
            audio_path: Path to audio file
            target_sr: Target sample rate

        Returns:
            Tuple of (audio_array, sample_rate)

        Raises:
            ImportError: If librosa is not installed.
        """
        if librosa is None:
            raise ImportError("librosa library not installed. Run: pip install librosa")

        audio, sr = librosa.load(str(audio_path), sr=target_sr)
        return audio, sr

    def get_model_info(self) -> dict:
        """
        Get information about the loaded model

        Returns:
            Dict with ``model_name``, ``device``, ``fp16_enabled`` and
            ``sample_rate``; on CUDA also ``vram_allocated_gb`` and
            ``vram_reserved_gb``. ``model_name`` is "XTTS v2" for the default
            model and the configured model identifier otherwise.
        """
        model_id = getattr(self, "model_name", _DEFAULT_MODEL_NAME)
        synthesizer = getattr(self.tts, 'synthesizer', None)

        info = {
            "model_name": _MODEL_LABELS.get(model_id, model_id),
            "device": self.device,
            "fp16_enabled": self.use_fp16,
            "sample_rate": getattr(synthesizer, 'output_sample_rate', _FALLBACK_SAMPLE_RATE),
        }

        # Get VRAM usage if on CUDA
        if _is_cuda(self.device):
            info["vram_allocated_gb"] = torch.cuda.memory_allocated() / 1e9
            info["vram_reserved_gb"] = torch.cuda.memory_reserved() / 1e9

        return info

    def __repr__(self):
        return f"VoiceCloner(device={self.device}, fp16={self.use_fp16})"


def main():
    """Demo usage of VoiceCloner"""
    print("=" * 60)
    print("Voice Cloner Demo")
    print("=" * 60)

    # Initialize
    cloner = VoiceCloner(device="cuda", use_fp16=True)

    # Print model info
    print("\nšŸ“Š Model Information:")
    info = cloner.get_model_info()
    for key, value in info.items():
        print(f" {key}: {value}")

    print("\n" + "=" * 60)
    print("Ready to clone voices!")
    print("=" * 60)


if __name__ == "__main__":
    main()