"""
Voice Cloner Module
Main API for few-shot voice cloning using XTTS v2
"""

import torch
import numpy as np
import soundfile as sf
import librosa
from pathlib import Path
from typing import Optional, Union, Tuple
import warnings
import os
warnings.filterwarnings('ignore')

# Set environment variable to agree to TTS license for non-commercial use
os.environ['COQUI_TOS_AGREED'] = '1'

# Fix PyTorch 2.6+ weights_only issue - disable weights_only for TTS models
import torch
# Monkey patch torch.load to use weights_only=False for compatibility
_original_torch_load = torch.load


def _patched_torch_load(*args, **kwargs):
    """
    Drop-in replacement for torch.load that defaults weights_only to False.

    An explicit weights_only argument from the caller is left untouched; the
    default is only filled in when the keyword is absent.

    SECURITY NOTE: with weights_only=False, torch.load performs full pickle
    deserialization, which can execute arbitrary code. Because this patch is
    process-wide, only load checkpoints from sources you trust.
    """
    if 'weights_only' not in kwargs:
        kwargs['weights_only'] = False
    return _original_torch_load(*args, **kwargs)


# Install the wrapper globally so every later torch.load call goes through it
torch.load = _patched_torch_load

# Import the Coqui TTS API if it is installed. The import is guarded so this
# module can still be imported without the dependency.
try:
    from TTS.api import TTS
except ImportError:
    print("Warning: TTS not installed. Run: pip install TTS")
    # Sentinel: VoiceCloner.__init__ checks for None and raises ImportError
    TTS = None


class VoiceCloner:
    """
    Few-shot voice cloning system using XTTS v2

    Features:
    - Clone any voice with 5-30 seconds of reference audio
    - Multi-speaker support
    - Real-time inference optimized for RTX 5060 Ti
    - Mixed precision (FP16) support
    """

    def __init__(
        self,
        model_name: str = "tts_models/multilingual/multi-dataset/xtts_v2",
        device: str = "cuda",
        use_fp16: bool = True,
        cache_dir: Optional[str] = None
    ):
        """
        Initialize the Voice Cloner

        Args:
            model_name: TTS model name (default: XTTS v2)
            device: Device to run on ('cuda', 'cuda:N' or 'cpu'). Falls back
                to 'cpu' when CUDA is not available.
            use_fp16: Use mixed precision for faster inference (only honoured
                on CUDA devices)
            cache_dir: Directory to cache models. When given, it is exported
                as TTS_HOME while the model is being constructed.

        Raises:
            ImportError: If the TTS library is not installed.
            Exception: Any error raised while loading the model is printed
                and re-raised unchanged.
        """
        if TTS is None:
            raise ImportError("TTS library not installed. Run: pip install TTS")

        self.model_name = model_name
        self.device = device if torch.cuda.is_available() else "cpu"
        self.use_fp16 = bool(use_fp16 and self._on_cuda)

        print(f"🚀 Initializing Voice Cloner on {self.device}...")
        print(f"   Model: {model_name}")
        print(f"   Mixed Precision (FP16): {self.use_fp16}")

        # Initialize TTS model
        try:
            self.tts = self._load_tts(model_name, cache_dir)

            # Move to device
            if hasattr(self.tts, 'synthesizer') and hasattr(self.tts.synthesizer, 'tts_model'):
                self.tts.synthesizer.tts_model.to(self.device)

                # Enable FP16 if requested
                # NOTE(review): casting the whole model to half in addition to
                # running under autocast may not be supported by every model -
                # confirm against the TTS version in use.
                if self.use_fp16:
                    self.tts.synthesizer.tts_model.half()
                    print("   ✓ FP16 enabled")

            print("✓ Voice Cloner initialized successfully!")

        except Exception as e:
            print(f"❌ Error initializing TTS model: {e}")
            raise

    @property
    def _on_cuda(self) -> bool:
        """Whether the resolved device is a CUDA device ('cuda', 'cuda:0', ...)."""
        return str(self.device).startswith("cuda")

    def _load_tts(self, model_name: str, cache_dir: Optional[Union[str, Path]]):
        """Construct the TTS object, pointing TTS_HOME at cache_dir if one is given."""
        if cache_dir is None:
            return TTS(model_name=model_name, gpu=self._on_cuda)

        # Coqui TTS resolves its model directory from the TTS_HOME environment
        # variable. Override it only for the duration of the constructor so
        # the setting does not leak into the rest of the process.
        previous_home = os.environ.get("TTS_HOME")
        os.environ["TTS_HOME"] = str(Path(cache_dir).expanduser())
        try:
            return TTS(model_name=model_name, gpu=self._on_cuda)
        finally:
            if previous_home is None:
                os.environ.pop("TTS_HOME", None)
            else:
                os.environ["TTS_HOME"] = previous_home

    def clone_voice(
        self,
        text: str,
        reference_audio_path: Union[str, Path],
        language: str = "en",
        output_path: Optional[Union[str, Path]] = None,
        speed: float = 1.0
    ) -> Tuple[np.ndarray, int]:
        """
        Clone a voice and synthesize speech

        Args:
            text: Text to synthesize
            reference_audio_path: Path to reference audio (5-30s recommended)
            language: Language code ('en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl', 'cs', 'ar', 'zh-cn', 'ja', 'hu', 'ko')
            output_path: Optional path to save output audio
            speed: Speech speed multiplier (default: 1.0)

        Returns:
            Tuple of (audio_array, sample_rate)

        Raises:
            ValueError: If text is empty or only whitespace.
            FileNotFoundError: If the reference audio file does not exist.
            Exception: Any synthesis or save error is printed and re-raised.
        """
        # Validate inputs
        if not text or not text.strip():
            raise ValueError("Text cannot be empty")

        if len(text) > 1000:
            # This module installs warnings.filterwarnings('ignore') at import
            # time, which would silently drop this warning. Re-enable warnings
            # locally so the caller actually sees it.
            with warnings.catch_warnings():
                warnings.simplefilter("always")
                warnings.warn(
                    "Text is very long (>1000 chars). Consider splitting for better quality.",
                    stacklevel=2
                )

        reference_audio_path = Path(reference_audio_path)
        if not reference_audio_path.exists():
            raise FileNotFoundError(f"Reference audio not found: {reference_audio_path}")

        print(f"🎤 Cloning voice from: {reference_audio_path.name}")
        print(f"📝 Text length: {len(text)} characters")
        print(f"🌍 Language: {language}")

        try:
            # Synthesize speech (autocast is a no-op unless FP16 is enabled)
            with torch.autocast(device_type="cuda", enabled=self.use_fp16):
                wav = self.tts.tts(
                    text=text,
                    speaker_wav=str(reference_audio_path),
                    language=language,
                    speed=speed
                )

            # Convert to numpy array
            if isinstance(wav, torch.Tensor):
                wav = wav.detach().cpu().numpy()
            elif not isinstance(wav, np.ndarray):
                wav = np.asarray(wav)

            # Get sample rate
            sample_rate = self.tts.synthesizer.output_sample_rate

            # Save if output path provided
            if output_path:
                self.save_audio(wav, output_path, sample_rate)
                print(f"✓ Audio saved to: {output_path}")

            print(f"✓ Synthesis complete! Duration: {len(wav)/sample_rate:.2f}s")

            return wav, sample_rate

        except Exception as e:
            print(f"❌ Error during synthesis: {e}")
            raise

    def clone_multiple_speakers(
        self,
        text: str,
        speaker_references: dict,
        language: str = "en",
        output_dir: Optional[Union[str, Path]] = None,
        speed: float = 1.0
    ) -> dict:
        """
        Synthesize the same text in multiple voices

        Args:
            text: Text to synthesize
            speaker_references: Dict mapping speaker names to reference audio paths
            language: Language code
            output_dir: Directory to save outputs (one '<speaker_name>.wav' per speaker)
            speed: Speech speed multiplier forwarded to clone_voice (default: 1.0)

        Returns:
            Dict mapping speaker names to (audio_array, sample_rate) tuples.
            A speaker whose synthesis failed maps to None.
        """
        results = {}

        if output_dir:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)

        print(f"🎭 Synthesizing for {len(speaker_references)} speakers...")

        for speaker_name, ref_path in speaker_references.items():
            print(f"\n--- Speaker: {speaker_name} ---")

            output_path = output_dir / f"{speaker_name}.wav" if output_dir else None

            # Best effort per speaker: one failure must not abort the batch
            try:
                results[speaker_name] = self.clone_voice(
                    text=text,
                    reference_audio_path=ref_path,
                    language=language,
                    output_path=output_path,
                    speed=speed
                )
            except Exception as e:
                print(f"❌ Failed for {speaker_name}: {e}")
                results[speaker_name] = None

        succeeded = sum(1 for result in results.values() if result is not None)
        print(f"\n✓ Completed {succeeded}/{len(speaker_references)} speakers")
        return results

    @staticmethod
    def save_audio(
        audio: np.ndarray,
        output_path: Union[str, Path],
        sample_rate: int = 24000
    ):
        """
        Save audio to file

        Args:
            audio: Audio array
            output_path: Output file path (parent directories are created)
            sample_rate: Sample rate (default: 24000 Hz)
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Hard-limit samples to [-1.0, 1.0]. This clips out-of-range values;
        # it does not rescale (normalize) the signal.
        audio = np.clip(audio, -1.0, 1.0)

        sf.write(str(output_path), audio, sample_rate)

    @staticmethod
    def load_audio(
        audio_path: Union[str, Path],
        target_sr: int = 24000
    ) -> Tuple[np.ndarray, int]:
        """
        Load and resample audio

        Args:
            audio_path: Path to audio file
            target_sr: Target sample rate

        Returns:
            Tuple of (audio_array, sample_rate)
        """
        audio, sr = librosa.load(str(audio_path), sr=target_sr)
        return audio, sr

    def get_model_info(self) -> dict:
        """
        Get information about the loaded model

        Returns:
            Dict with the model name passed at construction, the device, the
            FP16 flag and the output sample rate. On CUDA devices it also
            contains the allocated and reserved VRAM in GB.
        """
        info = {
            # Report the model that was actually requested; fall back to the
            # historical label if the attribute was never set.
            "model_name": getattr(self, "model_name", "XTTS v2"),
            "device": self.device,
            "fp16_enabled": self.use_fp16,
            "sample_rate": self.tts.synthesizer.output_sample_rate if hasattr(self.tts, 'synthesizer') else 24000,
        }

        # Get VRAM usage if on CUDA
        if self._on_cuda:
            info["vram_allocated_gb"] = torch.cuda.memory_allocated() / 1e9
            info["vram_reserved_gb"] = torch.cuda.memory_reserved() / 1e9

        return info

    def __repr__(self):
        return f"VoiceCloner(device={self.device}, fp16={self.use_fp16})"


def main():
    """Run a small demo: build a VoiceCloner and print its model details."""
    banner = "=" * 60

    print(banner)
    print("Voice Cloner Demo")
    print(banner)

    # Build the cloner with the demo's fixed settings
    cloner = VoiceCloner(device="cuda", use_fp16=True)

    # Dump every entry reported by the model
    print("\n📊 Model Information:")
    for key, value in cloner.get_model_info().items():
        print(f"   {key}: {value}")

    print("\n" + banner)
    print("Ready to clone voices!")
    print(banner)


# Run the demo only when this file is executed as a script, not on import
if __name__ == "__main__":
    main()