# Source: TTS-with-VoiceCloning/src/voice_cloner.py
# Uploaded by saadmannan -- "initial commit" (5ffccae)
"""
Voice Cloner Module
Main API for few-shot voice cloning using XTTS v2
"""
import torch
import numpy as np
import soundfile as sf
import librosa
from pathlib import Path
from typing import Optional, Union, Tuple
import warnings
import os
warnings.filterwarnings('ignore')
# Set environment variable to agree to TTS license for non-commercial use
os.environ['COQUI_TOS_AGREED'] = '1'
# Fix PyTorch 2.6+ weights_only issue - disable weights_only for TTS models
import torch
# Monkey patch torch.load to use weights_only=False for compatibility
import functools

# SECURITY NOTE(review): defaulting ``weights_only`` to False lets torch.load
# unpickle arbitrary Python objects, which can execute code embedded in a
# checkpoint. Only load model files from sources you trust. Callers can still
# pass ``weights_only=True`` explicitly; the patch only changes the default.
_original_torch_load = torch.load


@functools.wraps(_original_torch_load)
def _patched_torch_load(*args, **kwargs):
    """
    Call the original ``torch.load`` with ``weights_only`` defaulting to False.

    Every positional and keyword argument is forwarded untouched; the only
    change is that ``weights_only=False`` is filled in when the caller did not
    specify it. ``functools.wraps`` keeps the original name, docstring and
    signature visible on the replacement.
    """
    kwargs.setdefault('weights_only', False)
    return _original_torch_load(*args, **kwargs)


# Install the wrapper process-wide so code that calls torch.load later
# (such as the TTS model loading below) picks up the relaxed default.
torch.load = _patched_torch_load
# The TTS package is treated as optional at import time so this module can
# still be imported without it; VoiceCloner.__init__ checks for the None
# placeholder and raises ImportError when the package is actually needed.
try:
    from TTS.api import TTS
except ImportError:
    TTS = None
    print("Warning: TTS not installed. Run: pip install TTS")
class VoiceCloner:
    """
    Few-shot voice cloning system using XTTS v2

    Features:
    - Clone any voice with 5-30 seconds of reference audio
    - Multi-speaker support
    - Real-time inference optimized for RTX 5060 Ti
    - Mixed precision (FP16) support
    """

    def __init__(
        self,
        model_name: str = "tts_models/multilingual/multi-dataset/xtts_v2",
        device: str = "cuda",
        use_fp16: bool = True,
        cache_dir: Optional[str] = None
    ):
        """
        Initialize the Voice Cloner

        Args:
            model_name: TTS model name (default: XTTS v2)
            device: Device to run on ('cuda' or 'cpu'). Falls back to 'cpu'
                when CUDA is not available.
            use_fp16: Use mixed precision for faster inference. Only honoured
                when the resolved device is exactly 'cuda'.
            cache_dir: Directory to cache models. NOTE: the value is stored on
                the instance but is not forwarded to the TTS constructor, so
                it currently has no effect on where models are cached.

        Raises:
            ImportError: If the TTS package could not be imported.
            Exception: Any error raised while loading the model is printed
                and re-raised unchanged.
        """
        if TTS is None:
            raise ImportError("TTS library not installed. Run: pip install TTS")

        # Keep the constructor arguments so get_model_info() can report the
        # model that was actually requested.
        self.model_name = model_name
        self.cache_dir = cache_dir

        self.device = device if torch.cuda.is_available() else "cpu"
        self.use_fp16 = use_fp16 and self.device == "cuda"

        print(f"🚀 Initializing Voice Cloner on {self.device}...")
        print(f" Model: {model_name}")
        print(f" Mixed Precision (FP16): {self.use_fp16}")

        # Initialize TTS model
        try:
            self.tts = TTS(
                model_name=model_name,
                gpu=(self.device == "cuda")
            )

            # Move to device (and optionally cast to half precision) only when
            # the wrapper exposes the underlying model object.
            synthesizer = getattr(self.tts, 'synthesizer', None)
            if synthesizer is not None and hasattr(synthesizer, 'tts_model'):
                synthesizer.tts_model.to(self.device)

                # Enable FP16 if requested
                if self.use_fp16:
                    synthesizer.tts_model.half()
                    print(" ✓ FP16 enabled")

            print("✓ Voice Cloner initialized successfully!")
        except Exception as e:
            print(f"❌ Error initializing TTS model: {e}")
            raise

    def clone_voice(
        self,
        text: str,
        reference_audio_path: Union[str, Path],
        language: str = "en",
        output_path: Optional[Union[str, Path]] = None,
        speed: float = 1.0
    ) -> Tuple[np.ndarray, int]:
        """
        Clone a voice and synthesize speech

        Args:
            text: Text to synthesize
            reference_audio_path: Path to reference audio (5-30s recommended)
            language: Language code ('en', 'es', 'fr', 'de', 'it', 'pt', 'pl', 'tr', 'ru', 'nl', 'cs', 'ar', 'zh-cn', 'ja', 'hu', 'ko')
            output_path: Optional path to save output audio
            speed: Speech speed multiplier (default: 1.0)

        Returns:
            Tuple of (audio_array, sample_rate)

        Raises:
            ValueError: If ``text`` is empty or contains only whitespace.
            FileNotFoundError: If ``reference_audio_path`` is not an existing
                file.
            Exception: Any error raised during synthesis or saving is printed
                and re-raised unchanged.
        """
        # Validate inputs
        if not text or not text.strip():
            raise ValueError("Text cannot be empty")

        if len(text) > 1000:
            warnings.warn("Text is very long (>1000 chars). Consider splitting for better quality.")

        reference_audio_path = Path(reference_audio_path)
        # is_file() rather than exists(): a directory is not usable as
        # reference audio and should be rejected here, before synthesis.
        if not reference_audio_path.is_file():
            raise FileNotFoundError(f"Reference audio not found: {reference_audio_path}")

        print(f"🎤 Cloning voice from: {reference_audio_path.name}")
        print(f"📝 Text length: {len(text)} characters")
        print(f"🌍 Language: {language}")

        try:
            # Synthesize speech. torch.autocast(device_type="cuda") replaces
            # the deprecated torch.cuda.amp.autocast; with enabled=False it is
            # a no-op context.
            with torch.autocast(device_type="cuda", enabled=self.use_fp16):
                wav = self.tts.tts(
                    text=text,
                    speaker_wav=str(reference_audio_path),
                    language=language,
                    speed=speed
                )

            # Convert to numpy array
            if isinstance(wav, torch.Tensor):
                wav = wav.cpu().numpy()
            elif isinstance(wav, list):
                wav = np.array(wav)

            # Get sample rate
            sample_rate = self.tts.synthesizer.output_sample_rate

            # Save if output path provided
            if output_path:
                self.save_audio(wav, output_path, sample_rate)
                print(f"✓ Audio saved to: {output_path}")

            print(f"✓ Synthesis complete! Duration: {len(wav)/sample_rate:.2f}s")
            return wav, sample_rate
        except Exception as e:
            print(f"❌ Error during synthesis: {e}")
            raise

    def clone_multiple_speakers(
        self,
        text: str,
        speaker_references: dict,
        language: str = "en",
        output_dir: Optional[Union[str, Path]] = None
    ) -> dict:
        """
        Synthesize the same text in multiple voices

        Each speaker is processed independently: a failure for one speaker is
        printed and recorded as ``None`` without stopping the others.

        Args:
            text: Text to synthesize
            speaker_references: Dict mapping speaker names to reference audio paths
            language: Language code
            output_dir: Directory to save outputs (created if missing). Each
                speaker is written to ``<output_dir>/<speaker_name>.wav``.

        Returns:
            Dict mapping speaker names to (audio_array, sample_rate) tuples,
            or to None for speakers whose synthesis failed
        """
        results = {}

        if output_dir:
            output_dir = Path(output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)

        print(f"🎭 Synthesizing for {len(speaker_references)} speakers...")

        for speaker_name, ref_path in speaker_references.items():
            print(f"\n--- Speaker: {speaker_name} ---")

            output_path = output_dir / f"{speaker_name}.wav" if output_dir else None

            try:
                wav, sr = self.clone_voice(
                    text=text,
                    reference_audio_path=ref_path,
                    language=language,
                    output_path=output_path
                )
                results[speaker_name] = (wav, sr)
            except Exception as e:
                # Deliberate best-effort: keep going with the remaining speakers.
                print(f"❌ Failed for {speaker_name}: {e}")
                results[speaker_name] = None

        succeeded = sum(result is not None for result in results.values())
        print(f"\n✓ Completed {succeeded}/{len(speaker_references)} speakers")
        return results

    @staticmethod
    def save_audio(
        audio: np.ndarray,
        output_path: Union[str, Path],
        sample_rate: int = 24000
    ):
        """
        Save audio to file

        Parent directories of ``output_path`` are created when missing.

        Args:
            audio: Audio array
            output_path: Output file path
            sample_rate: Sample rate (default: 24000 Hz)
        """
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Hard-limit samples to [-1.0, 1.0]. This clips out-of-range values;
        # it does not rescale (normalize) the signal.
        audio = np.clip(audio, -1.0, 1.0)
        sf.write(str(output_path), audio, sample_rate)

    @staticmethod
    def load_audio(
        audio_path: Union[str, Path],
        target_sr: int = 24000
    ) -> Tuple[np.ndarray, int]:
        """
        Load and resample audio

        Args:
            audio_path: Path to audio file
            target_sr: Target sample rate passed to ``librosa.load`` as ``sr``

        Returns:
            Tuple of (audio_array, sample_rate)
        """
        audio, sr = librosa.load(str(audio_path), sr=target_sr)
        return audio, sr

    def get_model_info(self) -> dict:
        """
        Get information about the loaded model

        Returns:
            Dict with model information: ``model_name`` (fixed display label),
            ``model_id`` (the model identifier given to the constructor),
            ``device``, ``fp16_enabled`` and ``sample_rate``; plus
            ``vram_allocated_gb`` / ``vram_reserved_gb`` when running on CUDA.
        """
        if hasattr(self.tts, 'synthesizer'):
            sample_rate = self.tts.synthesizer.output_sample_rate
        else:
            sample_rate = 24000

        info = {
            # Display label kept unchanged for existing callers; "model_id"
            # carries the identifier that was actually requested.
            "model_name": "XTTS v2",
            "model_id": self.model_name,
            "device": self.device,
            "fp16_enabled": self.use_fp16,
            "sample_rate": sample_rate,
        }

        # Get VRAM usage if on CUDA
        if self.device == "cuda":
            info["vram_allocated_gb"] = torch.cuda.memory_allocated() / 1e9
            info["vram_reserved_gb"] = torch.cuda.memory_reserved() / 1e9

        return info

    def __repr__(self):
        return f"VoiceCloner(device={self.device}, fp16={self.use_fp16})"
def main():
    """Demo entry point: build a VoiceCloner and print its model information."""
    banner = "=" * 60

    print(banner)
    print("Voice Cloner Demo")
    print(banner)

    # Build the cloner with the demo's fixed settings (CUDA + FP16 requested).
    cloner = VoiceCloner(device="cuda", use_fp16=True)

    # Dump every field reported by get_model_info()
    print("\n📊 Model Information:")
    for field, value in cloner.get_model_info().items():
        print(f" {field}: {value}")

    print("\n" + banner)
    print("Ready to clone voices!")
    print(banner)


if __name__ == "__main__":
    main()