"""
IndicF5-Hinglish: Custom model wrapper for HuggingFace compatibility.

This module provides a custom model class that can be loaded via:
    model = AutoModel.from_pretrained("Saravananravi/indicf5-hinglish")

Usage:
    from indicf5_hinglish import IndicF5Hinglish
    
    model = IndicF5Hinglish.from_pretrained("Saravananravi/indicf5-hinglish")
    audio, sr = model.generate("मैं आज office जा रहा हूँ", ref_audio="ref.wav", ref_text="reference text")
"""

import os
import torch
import numpy as np
import soundfile as sf

# Default hyperparameters consumed by IndicF5Hinglish.__init__.
MODEL_CONFIG = dict(
    # Forwarded to the DiT backbone constructor.
    dim=1024,
    depth=22,
    heads=16,
    ff_mult=2,
    text_dim=512,
    conv_layers=4,
    text_num_embeds=2546,
    mel_dim=100,
    # Forwarded to CFM as its mel-spectrogram keyword arguments.
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    n_mel_channels=100,
    target_sample_rate=24000,
)

# Module-level sample rate constant (same value as "target_sample_rate" above).
SAMPLE_RATE = 24000

class IndicF5Hinglish(torch.nn.Module):
    """IndicF5 fine-tuned for Hindi-English code-switched TTS.

    Wraps an F5-TTS ``CFM`` model built on a ``DiT`` backbone and exposes
    ``from_pretrained`` / ``generate`` helpers around it. The wrapped model
    is available as ``self.model``.
    """

    # Checkpoint file names probed, in order of preference.
    _CHECKPOINT_NAMES = ("model.safetensors", "model_last.pt")

    def __init__(self, config=None):
        """Build the DiT backbone and the CFM model around it.

        Args:
            config: Optional mapping of hyperparameters. Keys that are not
                supplied fall back to the values in ``MODEL_CONFIG``.

        Raises:
            ImportError: If the ``f5_tts`` package cannot be imported.
        """
        super().__init__()
        # Merge into a fresh dict so that a partial config works and so that
        # later mutation of self.config never touches the shared module-level
        # defaults.
        self.config = {**MODEL_CONFIG, **(config or {})}

        # Import F5-TTS components lazily (requires the f5_tts package).
        try:
            from f5_tts.model import CFM, DiT
            from f5_tts.model.utils import get_tokenizer
        except ImportError as err:
            raise ImportError(
                "f5_tts is required. Install with: pip install f5-tts or clone from "
                "https://github.com/Saravananravi08/indicf5-finetune"
            ) from err

        self.CFM = CFM
        self.DiT = DiT
        # NOTE(review): get_tokenizer is kept as an attribute but is never
        # called in this class, and no vocabulary/tokenizer argument is passed
        # to CFM below -- confirm the checkpoint does not rely on a vocab map.
        self.get_tokenizer = get_tokenizer

        # Build model
        backbone = DiT(
            dim=self.config["dim"],
            depth=self.config["depth"],
            heads=self.config["heads"],
            ff_mult=self.config["ff_mult"],
            text_dim=self.config["text_dim"],
            conv_layers=self.config["conv_layers"],
            text_num_embeds=self.config["text_num_embeds"],
            mel_dim=self.config["mel_dim"],
        )

        self.model = CFM(
            transformer=backbone,
            mel_spec_kwargs=dict(
                n_fft=self.config["n_fft"],
                hop_length=self.config["hop_length"],
                win_length=self.config["win_length"],
                n_mel_channels=self.config["n_mel_channels"],
                target_sample_rate=self.config["target_sample_rate"],
                mel_spec_type="vocos",
            ),
            odeint_kwargs=dict(method="euler"),
        )

    @staticmethod
    def _clean_state_dict(state_dict):
        """Return a copy of ``state_dict`` without EMA bookkeeping.

        Drops the ``"initted"`` and ``"step"`` entries and strips a leading
        ``"ema_model."`` from every remaining key; other keys pass through
        unchanged.
        """
        prefix = "ema_model."
        return {
            (key[len(prefix):] if key.startswith(prefix) else key): value
            for key, value in state_dict.items()
            if key not in ("initted", "step")
        }

    @classmethod
    def _extract_state_dict(cls, checkpoint):
        """Pick the model weights out of a loaded checkpoint mapping.

        Prefers ``checkpoint["ema_model_state_dict"]``, then
        ``checkpoint["model_state_dict"]``; when neither key is present the
        mapping itself is treated as a flat state dict (this is the case for
        a safetensors file). The result is always passed through
        ``_clean_state_dict``.
        """
        for container_key in ("ema_model_state_dict", "model_state_dict"):
            if container_key in checkpoint:
                return cls._clean_state_dict(checkpoint[container_key])
        return cls._clean_state_dict(checkpoint)

    @classmethod
    def _resolve_checkpoint(cls, pretrained_model_name_or_path):
        """Return a local path to the checkpoint file.

        For a local directory the first existing file among
        ``_CHECKPOINT_NAMES`` is returned. Otherwise the argument is treated
        as a Hugging Face Hub repo id and the same names are tried in order.

        Raises:
            FileNotFoundError: If a local directory contains no checkpoint.
        """
        if os.path.isdir(pretrained_model_name_or_path):
            for name in cls._CHECKPOINT_NAMES:
                candidate = os.path.join(pretrained_model_name_or_path, name)
                if os.path.exists(candidate):
                    return candidate
            raise FileNotFoundError(
                f"No checkpoint ({' or '.join(cls._CHECKPOINT_NAMES)}) found in "
                f"{pretrained_model_name_or_path!r}"
            )

        from huggingface_hub import hf_hub_download

        preferred, fallback = cls._CHECKPOINT_NAMES
        try:
            return hf_hub_download(
                repo_id=pretrained_model_name_or_path,
                filename=preferred,
            )
        except Exception:
            # Best-effort fallback to the .pt checkpoint. ``Exception`` rather
            # than a bare ``except`` so Ctrl-C / SystemExit still propagate.
            return hf_hub_download(
                repo_id=pretrained_model_name_or_path,
                filename=fallback,
            )

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load a fine-tuned IndicF5-Hinglish checkpoint.

        Args:
            pretrained_model_name_or_path: Local directory containing
                ``model.safetensors`` or ``model_last.pt``, or a Hugging Face
                Hub repo id hosting one of those files.
            **kwargs: Optional ``config`` (hyperparameter mapping passed to
                the constructor) and ``device`` (target device; defaults to
                CUDA when available, else CPU). Other keys are ignored.

        Returns:
            The model in eval mode on the selected device.

        Raises:
            ImportError: If ``f5_tts`` is not installed.
            FileNotFoundError: If a local directory has no checkpoint file.
        """
        import warnings

        model = cls(config=kwargs.get("config"))
        ckpt_path = cls._resolve_checkpoint(pretrained_model_name_or_path)

        # Load checkpoint
        if ckpt_path.endswith(".safetensors"):
            from safetensors.torch import load_file

            checkpoint = load_file(ckpt_path)
        else:
            checkpoint = torch.load(ckpt_path, weights_only=True, map_location="cpu")
        state_dict = cls._extract_state_dict(checkpoint)

        # strict=False tolerates partial key mismatches, but a checkpoint that
        # matches nothing at all would leave the model randomly initialised;
        # surface that instead of failing silently.
        load_result = model.model.load_state_dict(state_dict, strict=False)
        if not set(state_dict) - set(load_result.unexpected_keys):
            warnings.warn(
                f"No weights from {ckpt_path!r} matched the model; "
                "it is still randomly initialised.",
                RuntimeWarning,
                stacklevel=2,
            )

        device = kwargs.get("device") or (
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        model = model.to(device)
        model.eval()

        return model

    def _get_vocoder(self, device):
        """Return the vocos vocoder for ``device``, loading it on first use.

        The cache lives in the instance ``__dict__`` inside a plain dict, so
        the vocoder is not assigned as a module attribute of this wrapper.
        """
        cache = self.__dict__.setdefault("_vocoder_cache", {})
        cache_key = str(device)
        if cache_key not in cache:
            from f5_tts.infer.utils_infer import load_vocoder

            cache[cache_key] = load_vocoder(
                vocoder_name="vocos", is_local=False, device=device
            )
        return cache[cache_key]

    def generate(self, text, ref_audio=None, ref_text=None, nfe_step=16, speed=1.0):
        """
        Generate speech from text.

        Args:
            text: Input text (Hinglish/Hindi/English)
            ref_audio: Path to reference audio for voice cloning
            ref_text: Transcript of reference audio
            nfe_step: Number of NFEs (16=fast, 32=quality)
            speed: Speech speed (1.0 = normal)

        Returns:
            audio: numpy float32 array of audio samples
            sr: sample rate

        Raises:
            ValueError: If ``ref_audio`` or ``ref_text`` is None.
        """
        if ref_audio is None or ref_text is None:
            raise ValueError("Reference audio and text are required")

        from f5_tts.infer.utils_infer import (
            infer_process, preprocess_ref_audio_text
        )

        device = next(self.parameters()).device

        # Reuse the vocoder across calls instead of reloading it every time.
        vocoder = self._get_vocoder(device)

        # Preprocess reference
        ref_audio_arr, ref_text_proc = preprocess_ref_audio_text(
            ref_audio, ref_text, device=device
        )

        # Generate (the third return value is unused here)
        audio, sr, _ = infer_process(
            ref_audio_arr, ref_text_proc, text,
            self.model, vocoder,
            mel_spec_type="vocos",
            speed=speed, device=device, nfe_step=nfe_step,
            show_info=lambda *a: None,  # silence progress messages
        )

        return np.array(audio, dtype=np.float32), sr


def main():
    """Command-line entry point: load a model, synthesize one text, save the audio.

    Parses the CLI flags, loads the checkpoint named by ``--model``, runs
    ``generate`` with the given reference audio/transcript, and writes the
    result to the ``--output`` path via ``soundfile``.
    """
    import argparse

    cli = argparse.ArgumentParser(description="IndicF5-Hinglish TTS")

    # The three mandatory flags share the same shape, so register them in a loop.
    mandatory_flags = (
        ("--text", "Text to synthesize"),
        ("--ref-audio", "Reference audio file"),
        ("--ref-text", "Reference audio transcript"),
    )
    for flag, description in mandatory_flags:
        cli.add_argument(flag, required=True, help=description)

    cli.add_argument("--output", default="output.wav", help="Output audio file")
    cli.add_argument(
        "--model",
        default="Saravananravi/indicf5-hinglish",
        help="Model name or path",
    )
    cli.add_argument("--nfe-step", type=int, default=16, help="NFE steps")
    opts = cli.parse_args()

    print(f"Loading model: {opts.model}")
    tts = IndicF5Hinglish.from_pretrained(opts.model)

    print(f"Generating: {opts.text}")
    waveform, rate = tts.generate(
        opts.text,
        ref_audio=opts.ref_audio,
        ref_text=opts.ref_text,
        nfe_step=opts.nfe_step,
    )

    sf.write(opts.output, waveform, rate)
    seconds = len(waveform) / rate
    print(f"Saved to: {opts.output} ({seconds:.2f}s)")


if __name__ == "__main__":
    main()