"""
IndicF5-Hinglish: Custom model wrapper for HuggingFace compatibility.

This module provides a custom model class that can be loaded via:
    model = AutoModel.from_pretrained("Saravananravi/indicf5-hinglish")

Usage:
    from indicf5_hinglish import IndicF5Hinglish
    model = IndicF5Hinglish.from_pretrained("Saravananravi/indicf5-hinglish")
    audio, sr = model.generate("मैं आज office जा रहा हूँ", ref_audio="ref.wav", ref_text="reference text")
"""

import os
from typing import Optional, Tuple

import numpy as np
import torch

try:
    import soundfile as sf
except ImportError:
    # soundfile is only needed by the command-line entry point (main) to
    # write the WAV file; the model class itself works without it.
    sf = None

# Configuration: DiT backbone hyperparameters followed by the mel-spectrogram
# settings that are forwarded to CFM in IndicF5Hinglish.__init__.
MODEL_CONFIG = {
    "dim": 1024,
    "depth": 22,
    "heads": 16,
    "ff_mult": 2,
    "text_dim": 512,
    "conv_layers": 4,
    "text_num_embeds": 2546,
    "mel_dim": 100,
    "n_fft": 1024,
    "hop_length": 256,
    "win_length": 1024,
    "n_mel_channels": 100,
    "target_sample_rate": 24000,
}

SAMPLE_RATE = 24000


class IndicF5Hinglish(torch.nn.Module):
    """IndicF5 fine-tuned for Hindi-English code-switched TTS."""

    # Checkpoint file names, in order of preference.
    _CHECKPOINT_FILENAMES = ("model.safetensors", "model_last.pt")

    # Non-tensor bookkeeping entries found in EMA checkpoints.
    _EMA_BOOKKEEPING_KEYS = ("initted", "step")

    def __init__(self, config=None):
        """Build the DiT backbone and wrap it in a CFM model.

        Args:
            config: Optional mapping of hyperparameters. Keys that are missing
                fall back to ``MODEL_CONFIG``; ``None`` uses the defaults.

        Raises:
            ImportError: If the ``f5_tts`` package is not installed.
        """
        super().__init__()

        # Merge into a fresh dict so a partial config works and the
        # module-level defaults are never aliased or mutated.
        merged = dict(MODEL_CONFIG)
        if config:
            merged.update(config)
        self.config = merged

        # (device, vocoder) pair filled lazily by generate(). Stored as a
        # tuple so the vocoder is not registered as a submodule of this model.
        self._vocoder_cache = None

        # Import F5-TTS components (requires f5_tts package)
        try:
            from f5_tts.model import CFM, DiT
            from f5_tts.model.utils import get_tokenizer
        except ImportError as err:
            raise ImportError(
                "f5_tts is required. Install with: pip install f5-tts or clone from "
                "https://github.com/Saravananravi08/indicf5-finetune"
            ) from err

        # Kept as instance attributes for backward compatibility.
        self.CFM = CFM
        self.DiT = DiT
        self.get_tokenizer = get_tokenizer

        # Build model
        backbone = DiT(
            dim=self.config["dim"],
            depth=self.config["depth"],
            heads=self.config["heads"],
            ff_mult=self.config["ff_mult"],
            text_dim=self.config["text_dim"],
            conv_layers=self.config["conv_layers"],
            text_num_embeds=self.config["text_num_embeds"],
            mel_dim=self.config["mel_dim"],
        )
        # NOTE(review): no vocab_char_map is passed to CFM and get_tokenizer
        # is imported but never called -- confirm the text tokenization used
        # at inference matches the one used during fine-tuning.
        self.model = CFM(
            transformer=backbone,
            mel_spec_kwargs=dict(
                n_fft=self.config["n_fft"],
                hop_length=self.config["hop_length"],
                win_length=self.config["win_length"],
                n_mel_channels=self.config["n_mel_channels"],
                target_sample_rate=self.config["target_sample_rate"],
                mel_spec_type="vocos",
            ),
            odeint_kwargs=dict(method="euler"),
        )

    @classmethod
    def _resolve_checkpoint(cls, pretrained_model_name_or_path):
        """Return a local checkpoint path for a directory or a Hub repo id."""
        if os.path.isdir(pretrained_model_name_or_path):
            for filename in cls._CHECKPOINT_FILENAMES:
                candidate = os.path.join(pretrained_model_name_or_path, filename)
                if os.path.exists(candidate):
                    return candidate
            raise FileNotFoundError(
                "No checkpoint found in {!r}; expected one of: {}".format(
                    pretrained_model_name_or_path,
                    ", ".join(cls._CHECKPOINT_FILENAMES),
                )
            )

        # Not a local directory: treat the argument as a HuggingFace repo id.
        from huggingface_hub import hf_hub_download

        preferred, fallback = cls._CHECKPOINT_FILENAMES
        try:
            return hf_hub_download(
                repo_id=pretrained_model_name_or_path,
                filename=preferred,
            )
        except Exception:
            # Best effort: any failure on the preferred file falls back to the
            # .pt checkpoint. Unlike a bare ``except``, KeyboardInterrupt and
            # SystemExit still propagate.
            return hf_hub_download(
                repo_id=pretrained_model_name_or_path,
                filename=fallback,
            )

    @staticmethod
    def _extract_state_dict(checkpoint):
        """Pick the weight mapping out of a loaded checkpoint.

        Prefers the ``"ema_model_state_dict"`` entry, then
        ``"model_state_dict"``, and otherwise treats the checkpoint itself as
        a flat state dict.
        """
        for key in ("ema_model_state_dict", "model_state_dict"):
            if key in checkpoint:
                return checkpoint[key]
        return checkpoint

    @classmethod
    def _clean_state_dict(cls, state_dict):
        """Strip the ``ema_model.`` prefix and drop EMA bookkeeping entries."""
        prefix = "ema_model."
        return {
            (key[len(prefix):] if key.startswith(prefix) else key): value
            for key, value in state_dict.items()
            if key not in cls._EMA_BOOKKEEPING_KEYS
        }

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """Load a fine-tuned IndicF5-Hinglish checkpoint.

        Args:
            pretrained_model_name_or_path: Local directory containing
                ``model.safetensors`` or ``model_last.pt``, or a HuggingFace
                Hub repo id to download one of those files from.
            **kwargs: ``device`` (optional) selects where the model is placed;
                other keyword arguments are accepted and ignored.

        Returns:
            The model in eval mode, on ``device`` if given, otherwise on CUDA
            when available and on CPU if not.

        Raises:
            FileNotFoundError: If a local directory holds no known checkpoint.
            ImportError: If ``f5_tts`` is not installed.
        """
        from safetensors.torch import load_file

        model = cls()
        ckpt_path = cls._resolve_checkpoint(pretrained_model_name_or_path)

        # Load checkpoint
        if ckpt_path.endswith(".safetensors"):
            checkpoint = load_file(ckpt_path)
        else:
            checkpoint = torch.load(ckpt_path, weights_only=True, map_location="cpu")

        # Cleaning is applied to every format: it is a no-op for unprefixed
        # keys, and with strict=False a prefixed key would otherwise be
        # skipped without any error.
        state_dict = cls._clean_state_dict(cls._extract_state_dict(checkpoint))
        model.model.load_state_dict(state_dict, strict=False)

        device = kwargs.get("device") or ("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        model.eval()
        return model

    def _get_vocoder(self, device):
        """Return the vocos vocoder for ``device``, loading it on first use."""
        cached = getattr(self, "_vocoder_cache", None)
        if cached is not None and cached[0] == device:
            return cached[1]

        from f5_tts.infer.utils_infer import load_vocoder

        vocoder = load_vocoder(vocoder_name="vocos", is_local=False, device=device)
        self._vocoder_cache = (device, vocoder)
        return vocoder

    def generate(
        self,
        text: str,
        ref_audio: Optional[str] = None,
        ref_text: Optional[str] = None,
        nfe_step: int = 16,
        speed: float = 1.0,
    ) -> Tuple[np.ndarray, int]:
        """
        Generate speech from text.

        Args:
            text: Input text (Hinglish/Hindi/English)
            ref_audio: Path to reference audio for voice cloning
            ref_text: Transcript of reference audio
            nfe_step: Number of NFEs (16=fast, 32=quality)
            speed: Speech speed (1.0 = normal)

        Returns:
            audio: numpy float32 array of audio samples
            sr: sample rate

        Raises:
            ValueError: If ``ref_audio`` or ``ref_text`` is missing, or if
                ``text`` is empty or whitespace-only.
        """
        if ref_audio is None or ref_text is None:
            raise ValueError("Reference audio and text are required")
        if not text or not text.strip():
            raise ValueError("Text to synthesize must not be empty")

        from f5_tts.infer.utils_infer import infer_process, preprocess_ref_audio_text

        device = next(self.parameters()).device

        # Load vocoder (cached after the first call on this device)
        vocoder = self._get_vocoder(device)

        # Preprocess reference
        ref_audio_proc, ref_text_proc = preprocess_ref_audio_text(
            ref_audio, ref_text, device=device
        )

        # Generate; the third value returned by infer_process is not used.
        audio, sr, _ = infer_process(
            ref_audio_proc,
            ref_text_proc,
            text,
            self.model,
            vocoder,
            mel_spec_type="vocos",
            speed=speed,
            device=device,
            nfe_step=nfe_step,
            show_info=lambda *a: None,  # silence progress messages
        )
        return np.array(audio, dtype=np.float32), sr


def main():
    """Example usage: synthesize one utterance and save it as an audio file."""
    import argparse

    parser = argparse.ArgumentParser(description="IndicF5-Hinglish TTS")
    parser.add_argument("--text", required=True, help="Text to synthesize")
    parser.add_argument("--ref-audio", required=True, help="Reference audio file")
    parser.add_argument("--ref-text", required=True, help="Reference audio transcript")
    parser.add_argument("--output", default="output.wav", help="Output audio file")
    parser.add_argument("--model", default="Saravananravi/indicf5-hinglish", help="Model name or path")
    parser.add_argument("--nfe-step", type=int, default=16, help="NFE steps")
    parser.add_argument("--speed", type=float, default=1.0, help="Speech speed (1.0 = normal)")
    args = parser.parse_args()

    # Fail before the expensive model load if the output cannot be written.
    if sf is None:
        raise ImportError(
            "soundfile is required to write audio. Install with: pip install soundfile"
        )

    print(f"Loading model: {args.model}")
    model = IndicF5Hinglish.from_pretrained(args.model)

    print(f"Generating: {args.text}")
    audio, sr = model.generate(
        args.text,
        ref_audio=args.ref_audio,
        ref_text=args.ref_text,
        nfe_step=args.nfe_step,
        speed=args.speed,
    )

    sf.write(args.output, audio, sr)
    print(f"Saved to: {args.output} ({len(audio)/sr:.2f}s)")


if __name__ == "__main__":
    main()