Spaces:
Runtime error
Runtime error
| """ | |
| Voice Cloner Module | |
| Main API for few-shot voice cloning using XTTS v2 | |
| """ | |
| import torch | |
| import numpy as np | |
| import soundfile as sf | |
| import librosa | |
| from pathlib import Path | |
| from typing import Optional, Union, Tuple | |
| import warnings | |
| import os | |
| warnings.filterwarnings('ignore') | |
| # Set environment variable to agree to TTS license for non-commercial use | |
| os.environ['COQUI_TOS_AGREED'] = '1' | |
| # Fix PyTorch 2.6+ weights_only issue - disable weights_only for TTS models | |
| import torch | |
| # Monkey patch torch.load to use weights_only=False for compatibility | |
_original_torch_load = torch.load


def _patched_torch_load(*args, **kwargs):
    """
    Drop-in replacement for torch.load that defaults weights_only to False

    Callers that pass weights_only explicitly keep their own value; every
    other argument is forwarded untouched to the original torch.load.

    SECURITY NOTE: weights_only=False allows full pickle deserialization,
    which can execute arbitrary code. Only load checkpoints from trusted
    sources while this patch is active.
    """
    if 'weights_only' not in kwargs:
        kwargs['weights_only'] = False
    return _original_torch_load(*args, **kwargs)


# Install the wrapper process-wide so the TTS library picks it up
torch.load = _patched_torch_load
| try: | |
| from TTS.api import TTS | |
| except ImportError: | |
| print("Warning: TTS not installed. Run: pip install TTS") | |
| TTS = None | |
class VoiceCloner:
    """
    Few-shot voice cloning system using XTTS v2

    Features:
    - Clone any voice with 5-30 seconds of reference audio
    - Multi-speaker support
    - Real-time inference optimized for RTX 5060 Ti
    - Mixed precision (FP16) support
    """

    def __init__(
        self,
        model_name: str = "tts_models/multilingual/multi-dataset/xtts_v2",
        device: str = "cuda",
        use_fp16: bool = True,
        cache_dir: Optional[str] = None
    ):
        """
        Initialize the Voice Cloner

        Args:
            model_name: TTS model name (default: XTTS v2)
            device: Device to run on ('cuda' or 'cpu'); replaced by 'cpu'
                when CUDA is not available
            use_fp16: Use mixed precision for faster inference (only
                honoured when the resolved device is 'cuda')
            cache_dir: Directory to cache models (accepted but currently
                not used by this class)

        Raises:
            ImportError: If the TTS library could not be imported
        """
        if TTS is None:
            raise ImportError("TTS library not installed. Run: pip install TTS")

        # Fall back to CPU when no CUDA device is present
        self.device = device if torch.cuda.is_available() else "cpu"
        # FP16 only makes sense on the GPU
        self.use_fp16 = use_fp16 and self.device == "cuda"

        print(f"🚀 Initializing Voice Cloner on {self.device}...")
        print(f" Model: {model_name}")
        print(f" Mixed Precision (FP16): {self.use_fp16}")

        # Initialize TTS model
        try:
            self.tts = TTS(
                model_name=model_name,
                gpu=(self.device == "cuda")
            )

            # Move to device
            if hasattr(self.tts, 'synthesizer') and hasattr(self.tts.synthesizer, 'tts_model'):
                self.tts.synthesizer.tts_model.to(self.device)

                # Enable FP16 if requested; kept under the same guard because
                # it touches the same attribute chain
                if self.use_fp16:
                    self.tts.synthesizer.tts_model.half()
                    print(" ✓ FP16 enabled")

            print("✓ Voice Cloner initialized successfully!")
        except Exception as e:
            print(f"❌ Error initializing TTS model: {e}")
            raise

    def clone_voice(
        self,
        text: str,
        reference_audio_path: Union[str, Path],
        language: str = "en",
        output_path: Optional[Union[str, Path]] = None,
        speed: float = 1.0
    ) -> Tuple[np.ndarray, int]:
        """
        Clone a voice and synthesize speech

        Args:
            text: Text to synthesize
            reference_audio_path: Path to reference audio (5-30s recommended)
            language: Language code ('en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl', 'cs', 'ar', 'zh-cn', 'ja', 'hu', 'ko')
            output_path: Optional path to save output audio
            speed: Speech speed multiplier (default: 1.0)

        Returns:
            Tuple of (audio_array, sample_rate)

        Raises:
            ValueError: If text is empty or whitespace only
            FileNotFoundError: If the reference audio file does not exist
        """
        # Validate inputs
        if not text or not text.strip():
            raise ValueError("Text cannot be empty")

        if len(text) > 1000:
            # NOTE: the module-level warnings.filterwarnings('ignore') silences
            # this warning unless the caller resets the warning filters.
            warnings.warn("Text is very long (>1000 chars). Consider splitting for better quality.")

        reference_audio_path = Path(reference_audio_path)
        if not reference_audio_path.exists():
            raise FileNotFoundError(f"Reference audio not found: {reference_audio_path}")

        print(f"🎤 Cloning voice from: {reference_audio_path.name}")
        print(f"📝 Text length: {len(text)} characters")
        print(f"🌍 Language: {language}")

        try:
            # Synthesize speech; torch.autocast replaces the deprecated
            # torch.cuda.amp.autocast and is a no-op when use_fp16 is False
            with torch.autocast(device_type="cuda", enabled=self.use_fp16):
                wav = self.tts.tts(
                    text=text,
                    speaker_wav=str(reference_audio_path),
                    language=language,
                    speed=speed
                )

            # Convert to numpy array
            if isinstance(wav, torch.Tensor):
                wav = wav.cpu().numpy()
            elif isinstance(wav, list):
                wav = np.array(wav)

            # Get sample rate
            sample_rate = self.tts.synthesizer.output_sample_rate

            # Save if output path provided
            if output_path:
                self.save_audio(wav, output_path, sample_rate)
                print(f"✓ Audio saved to: {output_path}")

            print(f"✓ Synthesis complete! Duration: {len(wav)/sample_rate:.2f}s")
            return wav, sample_rate
        except Exception as e:
            print(f"❌ Error during synthesis: {e}")
            raise

    def clone_multiple_speakers(
        self,
        text: str,
        speaker_references: dict,
        language: str = "en",
        output_dir: Optional[Union[str, Path]] = None
    ) -> dict:
        """
        Synthesize the same text in multiple voices

        A failure for one speaker is reported and recorded as None; the
        remaining speakers are still processed.

        Args:
            text: Text to synthesize
            speaker_references: Dict mapping speaker names to reference audio paths
            language: Language code
            output_dir: Directory to save outputs (one '<speaker_name>.wav' per speaker)

        Returns:
            Dict mapping speaker names to (audio_array, sample_rate) tuples,
            or to None for speakers whose synthesis failed
        """
        results = {}

        if output_dir:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)

        print(f"🎭 Synthesizing for {len(speaker_references)} speakers...")

        for speaker_name, ref_path in speaker_references.items():
            print(f"\n--- Speaker: {speaker_name} ---")

            output_path = None
            if output_dir:
                output_path = output_dir / f"{speaker_name}.wav"

            try:
                wav, sr = self.clone_voice(
                    text=text,
                    reference_audio_path=ref_path,
                    language=language,
                    output_path=output_path
                )
                results[speaker_name] = (wav, sr)
            except Exception as e:
                # Best effort: keep going with the remaining speakers
                print(f"❌ Failed for {speaker_name}: {e}")
                results[speaker_name] = None

        succeeded = sum(r is not None for r in results.values())
        print(f"\n✓ Completed {succeeded}/{len(speaker_references)} speakers")
        return results

    @staticmethod
    def save_audio(
        audio: np.ndarray,
        output_path: Union[str, Path],
        sample_rate: int = 24000
    ):
        """
        Save audio to file

        Declared as a static method so it can be called both as
        VoiceCloner.save_audio(...) and as self.save_audio(...).

        Args:
            audio: Audio array
            output_path: Output file path (parent directories are created)
            sample_rate: Sample rate (default: 24000 Hz)
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Hard-limit samples to [-1.0, 1.0] before writing
        audio = np.clip(audio, -1.0, 1.0)
        sf.write(str(output_path), audio, sample_rate)

    @staticmethod
    def load_audio(
        audio_path: Union[str, Path],
        target_sr: int = 24000
    ) -> Tuple[np.ndarray, int]:
        """
        Load and resample audio

        Args:
            audio_path: Path to audio file
            target_sr: Target sample rate

        Returns:
            Tuple of (audio_array, sample_rate)
        """
        audio, sr = librosa.load(str(audio_path), sr=target_sr)
        return audio, sr

    def get_model_info(self) -> dict:
        """
        Get information about the loaded model

        Returns:
            Dict with model information; VRAM figures are included only
            when running on CUDA
        """
        info = {
            "model_name": "XTTS v2",
            "device": self.device,
            "fp16_enabled": self.use_fp16,
            # Fall back to 24000 Hz when the wrapper exposes no synthesizer
            "sample_rate": self.tts.synthesizer.output_sample_rate if hasattr(self.tts, 'synthesizer') else 24000,
        }

        # Get VRAM usage if on CUDA
        if self.device == "cuda":
            info["vram_allocated_gb"] = torch.cuda.memory_allocated() / 1e9
            info["vram_reserved_gb"] = torch.cuda.memory_reserved() / 1e9

        return info

    def __repr__(self):
        return f"VoiceCloner(device={self.device}, fp16={self.use_fp16})"
def main():
    """Run a small demo: build a VoiceCloner and print its model info"""
    banner = "=" * 60

    print(banner)
    print("Voice Cloner Demo")
    print(banner)

    # Build the cloner with the demo defaults
    cloner = VoiceCloner(device="cuda", use_fp16=True)

    # Dump every key/value pair reported by the model
    print("\n📊 Model Information:")
    for key, value in cloner.get_model_info().items():
        print(f" {key}: {value}")

    print("\n" + banner)
    print("Ready to clone voices!")
    print(banner)


if __name__ == "__main__":
    main()