Spaces:

saadmannan
/

TTS-with-VoiceCloning

Runtime error

App Files Files Community

TTS-with-VoiceCloning / src /voice_cloner.py

saadmannan

initial commit

5ffccae 7 months ago

raw

history blame contribute delete

9.58 kB

	"""
	Voice Cloner Module
	Main API for few-shot voice cloning using XTTS v2
	"""

	import torch
	import numpy as np
	import soundfile as sf
	import librosa
	from pathlib import Path
	from typing import Optional, Union, Tuple
	import warnings
	import os
	warnings.filterwarnings('ignore')

	# Pre-accept the Coqui TTS terms of service (non-commercial use) through the
	# environment. This runs at import time, before the TTS library is imported
	# further down in this module.
	os.environ["COQUI_TOS_AGREED"] = "1"

	# PyTorch 2.6+ compatibility: the TTS checkpoints loaded by this module need
	# torch.load(..., weights_only=False), so torch.load is wrapped process-wide
	# to use that value whenever the caller does not pass weights_only explicitly.
	#
	# SECURITY NOTE: weights_only=False allows arbitrary pickled objects to be
	# deserialized. Only load checkpoints from sources you trust.
	import torch

	_original_torch_load = torch.load


	def _patched_torch_load(*args, **kwargs):
	    """Call the original ``torch.load`` with ``weights_only=False`` by default.

	    All positional and keyword arguments are forwarded unchanged. An explicit
	    ``weights_only=...`` supplied by the caller is respected.
	    """
	    kwargs.setdefault('weights_only', False)
	    return _original_torch_load(*args, **kwargs)


	torch.load = _patched_torch_load

	# Optional dependency: when the Coqui TTS package is missing, keep the module
	# importable and leave ``TTS`` as None so the failure can be reported later.
	try:
	    from TTS.api import TTS
	except ImportError:
	    TTS = None
	    print("Warning: TTS not installed. Run: pip install TTS")


	class VoiceCloner:
	    """
	    Few-shot voice cloning on top of a Coqui TTS model (XTTS v2 by default).

	    Features:
	    - Clone a voice from a short reference clip (5-30 seconds recommended)
	    - Synthesize the same text for several reference speakers in one call
	    - Runs on CUDA when available, otherwise falls back to CPU
	    - Optional mixed precision (FP16) when running on CUDA
	    """

	    # Identifier of the default model and the short label reported for it by
	    # get_model_info().
	    _DEFAULT_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
	    _DEFAULT_MODEL_LABEL = "XTTS v2"

	    def __init__(
	        self,
	        model_name: str = "tts_models/multilingual/multi-dataset/xtts_v2",
	        device: str = "cuda",
	        use_fp16: bool = True,
	        cache_dir: Optional[str] = None
	    ):
	        """
	        Initialize the Voice Cloner

	        Args:
	            model_name: TTS model name (default: XTTS v2)
	            device: Requested device ('cuda' or 'cpu'). Falls back to 'cpu'
	                when CUDA is not available.
	            use_fp16: Use mixed precision for faster inference. Only takes
	                effect when the resolved device is exactly 'cuda'.
	            cache_dir: Stored on the instance but not otherwise used by this
	                class; it does not change where models are cached.

	        Raises:
	            ImportError: If the TTS library could not be imported.
	            Exception: Any error raised while loading the model is printed
	                and re-raised unchanged.
	        """
	        if TTS is None:
	            raise ImportError("TTS library not installed. Run: pip install TTS")

	        self.model_name = model_name
	        self.cache_dir = cache_dir
	        self.device = device if torch.cuda.is_available() else "cpu"
	        self.use_fp16 = use_fp16 and self.device == "cuda"

	        print(f"🚀 Initializing Voice Cloner on {self.device}...")
	        print(f" Model: {model_name}")
	        print(f" Mixed Precision (FP16): {self.use_fp16}")

	        # Initialize TTS model
	        try:
	            self.tts = TTS(
	                model_name=model_name,
	                gpu=(self.device == "cuda")
	            )

	            # Look the underlying model up once; both the device move and the
	            # FP16 conversion need it, and both are skipped when it is absent.
	            synthesizer = getattr(self.tts, 'synthesizer', None)
	            tts_model = getattr(synthesizer, 'tts_model', None)
	            if tts_model is not None:
	                tts_model.to(self.device)

	                # Enable FP16 if requested
	                if self.use_fp16:
	                    tts_model.half()
	                    print(" ✓ FP16 enabled")

	            print("✓ Voice Cloner initialized successfully!")

	        except Exception as e:
	            print(f"❌ Error initializing TTS model: {e}")
	            raise

	    def clone_voice(
	        self,
	        text: str,
	        reference_audio_path: Union[str, Path],
	        language: str = "en",
	        output_path: Optional[Union[str, Path]] = None,
	        speed: float = 1.0
	    ) -> Tuple[np.ndarray, int]:
	        """
	        Clone a voice and synthesize speech

	        Args:
	            text: Text to synthesize
	            reference_audio_path: Path to reference audio (5-30s recommended)
	            language: Language code ('en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl', 'cs', 'ar', 'zh-cn', 'ja', 'hu', 'ko')
	            output_path: Optional path to save output audio
	            speed: Speech speed multiplier (default: 1.0)

	        Returns:
	            Tuple of (audio_array, sample_rate)

	        Raises:
	            ValueError: If ``text`` is empty or whitespace only.
	            FileNotFoundError: If ``reference_audio_path`` does not exist.
	            Exception: Any synthesis error is printed and re-raised unchanged.
	        """
	        # Validate inputs before touching the model
	        if not text or not text.strip():
	            raise ValueError("Text cannot be empty")

	        if len(text) > 1000:
	            warnings.warn("Text is very long (>1000 chars). Consider splitting for better quality.")

	        reference_audio_path = Path(reference_audio_path)
	        if not reference_audio_path.exists():
	            raise FileNotFoundError(f"Reference audio not found: {reference_audio_path}")

	        print(f"🎤 Cloning voice from: {reference_audio_path.name}")
	        print(f"📝 Text length: {len(text)} characters")
	        print(f"🌍 Language: {language}")

	        try:
	            # torch.autocast replaces the deprecated torch.cuda.amp.autocast.
	            # use_fp16 is only ever True on CUDA, so autocast is disabled on CPU.
	            with torch.autocast(device_type="cuda", enabled=self.use_fp16):
	                wav = self.tts.tts(
	                    text=text,
	                    speaker_wav=str(reference_audio_path),
	                    language=language,
	                    speed=speed
	                )

	            # Convert to numpy array
	            if isinstance(wav, torch.Tensor):
	                wav = wav.detach().cpu().numpy()
	            elif isinstance(wav, list):
	                wav = np.array(wav)

	            # Get sample rate
	            sample_rate = self.tts.synthesizer.output_sample_rate

	            # Save if output path provided
	            if output_path:
	                self.save_audio(wav, output_path, sample_rate)
	                print(f"✓ Audio saved to: {output_path}")

	            print(f"✓ Synthesis complete! Duration: {len(wav)/sample_rate:.2f}s")

	            return wav, sample_rate

	        except Exception as e:
	            print(f"❌ Error during synthesis: {e}")
	            raise

	    def clone_multiple_speakers(
	        self,
	        text: str,
	        speaker_references: dict,
	        language: str = "en",
	        output_dir: Optional[Union[str, Path]] = None,
	        speed: float = 1.0
	    ) -> dict:
	        """
	        Synthesize the same text in multiple voices

	        A failure for one speaker is printed and recorded as ``None``; the
	        remaining speakers are still processed.

	        Args:
	            text: Text to synthesize
	            speaker_references: Dict mapping speaker names to reference audio paths
	            language: Language code
	            output_dir: Directory to save outputs; each speaker is written to
	                ``<output_dir>/<speaker_name>.wav``. Created if missing.
	            speed: Speech speed multiplier forwarded to clone_voice (default: 1.0)

	        Returns:
	            Dict mapping speaker names to (audio_array, sample_rate) tuples,
	            or to None for speakers whose synthesis failed
	        """
	        results = {}

	        if output_dir:
	            output_dir = Path(output_dir)
	            output_dir.mkdir(parents=True, exist_ok=True)

	        print(f"🎭 Synthesizing for {len(speaker_references)} speakers...")

	        for speaker_name, ref_path in speaker_references.items():
	            print(f"\n--- Speaker: {speaker_name} ---")

	            output_path = output_dir / f"{speaker_name}.wav" if output_dir else None

	            try:
	                results[speaker_name] = self.clone_voice(
	                    text=text,
	                    reference_audio_path=ref_path,
	                    language=language,
	                    output_path=output_path,
	                    speed=speed
	                )
	            except Exception as e:
	                # Best-effort batch: report the failure and keep going.
	                print(f"❌ Failed for {speaker_name}: {e}")
	                results[speaker_name] = None

	        succeeded = sum(result is not None for result in results.values())
	        print(f"\n✓ Completed {succeeded}/{len(speaker_references)} speakers")
	        return results

	    @staticmethod
	    def save_audio(
	        audio: np.ndarray,
	        output_path: Union[str, Path],
	        sample_rate: int = 24000
	    ):
	        """
	        Save audio to file

	        Samples are hard-clipped to [-1.0, 1.0] before writing; no gain
	        normalization is applied. Missing parent directories are created.

	        Args:
	            audio: Audio array (assumed to be floating point samples)
	            output_path: Output file path
	            sample_rate: Sample rate (default: 24000 Hz)
	        """
	        output_path = Path(output_path)
	        output_path.parent.mkdir(parents=True, exist_ok=True)

	        # Hard-limit samples to the valid range
	        audio = np.clip(np.asarray(audio), -1.0, 1.0)

	        # soundfile does not accept float16 arrays, which FP16 inference may
	        # produce, so widen them to float32 first.
	        if audio.dtype == np.float16:
	            audio = audio.astype(np.float32)

	        sf.write(str(output_path), audio, sample_rate)

	    @staticmethod
	    def load_audio(
	        audio_path: Union[str, Path],
	        target_sr: int = 24000
	    ) -> Tuple[np.ndarray, int]:
	        """
	        Load and resample audio

	        Args:
	            audio_path: Path to audio file
	            target_sr: Target sample rate passed to ``librosa.load``

	        Returns:
	            Tuple of (audio_array, sample_rate)
	        """
	        audio, sr = librosa.load(str(audio_path), sr=target_sr)
	        return audio, sr

	    def get_model_info(self) -> dict:
	        """
	        Get information about the loaded model

	        Returns:
	            Dict with keys "model_name", "device", "fp16_enabled" and
	            "sample_rate", plus "vram_allocated_gb" and "vram_reserved_gb"
	            when the device is "cuda". "model_name" is "XTTS v2" for the
	            default model and the configured model identifier otherwise.
	        """
	        # Instances created without __init__ have no model_name; treat them
	        # as using the default model.
	        configured = getattr(self, "model_name", self._DEFAULT_MODEL_NAME)
	        if configured == self._DEFAULT_MODEL_NAME:
	            label = self._DEFAULT_MODEL_LABEL
	        else:
	            label = configured

	        # Fall back to 24000 Hz when the synthesizer is not available.
	        synthesizer = getattr(self.tts, 'synthesizer', None)
	        sample_rate = getattr(synthesizer, 'output_sample_rate', 24000)

	        info = {
	            "model_name": label,
	            "device": self.device,
	            "fp16_enabled": self.use_fp16,
	            "sample_rate": sample_rate,
	        }

	        # Get VRAM usage if on CUDA
	        if self.device == "cuda":
	            info["vram_allocated_gb"] = torch.cuda.memory_allocated() / 1e9
	            info["vram_reserved_gb"] = torch.cuda.memory_reserved() / 1e9

	        return info

	    def __repr__(self):
	        return f"VoiceCloner(device={self.device}, fp16={self.use_fp16})"


	def main():
	    """Demo entry point: build a VoiceCloner and print its model details."""
	    rule = "=" * 60

	    print(rule)
	    print("Voice Cloner Demo")
	    print(rule)

	    # Request CUDA with FP16
	    cloner = VoiceCloner(device="cuda", use_fp16=True)

	    # Dump every reported model property, one per line
	    print("\n📊 Model Information:")
	    for key, value in cloner.get_model_info().items():
	        print(f" {key}: {value}")

	    print("\n" + rule)
	    print("Ready to clone voices!")
	    print(rule)


	# Run the demo only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()