"""Gradio front-end that turns a face image plus text or audio into a lip-synced video.

On import this module creates its working directories, makes sure the Wav2Lip
checkpoint is available at ``MODEL_PATH``, builds the Gradio interface and
launches it.
"""

import contextlib
import os
import shutil
import subprocess
import uuid

import gradio as gr
import librosa
import soundfile as sf
from huggingface_hub import hf_hub_download

from inference import run_wav2lip

MODEL_PATH = "models/wav2lip.pth"

# Create necessary directories
os.makedirs("models", exist_ok=True)
os.makedirs("temp", exist_ok=True)
os.makedirs("results", exist_ok=True)

if not os.path.exists(MODEL_PATH):
    print("Downloading Wav2Lip model...")
    downloaded_path = hf_hub_download(
        repo_id="numz/wav2lip_studio",
        filename="Wav2lip/wav2lip.pth",
        local_dir="models",
        local_dir_use_symlinks=False,
    )
    # hf_hub_download keeps the repo-relative sub-path under local_dir, so the
    # checkpoint lands in models/Wav2lip/wav2lip.pth. Copy it to MODEL_PATH so
    # the existence check above succeeds on the next start.
    if os.path.abspath(downloaded_path) != os.path.abspath(MODEL_PATH):
        shutil.copyfile(downloaded_path, MODEL_PATH)


def normalize_audio(inp, out):
    """Load the audio file *inp* resampled to 16 kHz and write it to *out*."""
    y, _ = librosa.load(inp, sr=16000)
    sf.write(out, y, 16000)


def tts(text, out):
    """Synthesize *text* into the WAV file *out* using the ``espeak`` CLI.

    Raises:
        subprocess.CalledProcessError: if espeak exits with a non-zero status.
        FileNotFoundError: if the espeak executable is not installed.
    """
    # "--" ends option parsing so user text that starts with "-" is spoken
    # instead of being interpreted as an espeak command-line option.
    subprocess.run(
        ["espeak", "-w", out, "--", text],
        check=True,
    )


def generate(face, text, audio):
    """Produce a lip-synced video for *face* driven by *audio* or spoken *text*.

    Args:
        face: PIL image of the face (``None`` when nothing was uploaded).
        text: Text to synthesize; only used when *audio* is ``None``.
        audio: Path to an uploaded audio file, or ``None``.

    Returns:
        Path of the generated ``.mp4`` file.

    Raises:
        gr.Error: on missing input or when any step of the pipeline fails.
    """
    if face is None:
        raise gr.Error("Provide a face image.")

    uid = str(uuid.uuid4())
    face_path = f"{uid}.png"
    audio_path = f"{uid}.wav"
    out_video = f"{uid}.mp4"

    succeeded = False
    try:
        face.save(face_path)

        if audio is None:
            if not text or not text.strip():
                raise gr.Error("Provide text or audio.")
            tts(text, audio_path)
        else:
            normalize_audio(audio, audio_path)

        run_wav2lip(face_path, audio_path, out_video)

        # Check if output file exists
        if not os.path.exists(out_video):
            raise gr.Error("Video generation failed. Check temp/result.avi was created properly.")

        succeeded = True
        return out_video
    except gr.Error:
        # Already a user-facing message; do not wrap it a second time.
        raise
    except Exception as exc:
        raise gr.Error(f"Error during generation: {str(exc)}") from exc
    finally:
        # The intermediate inputs are never needed after this call, whether it
        # succeeded or not; a partial output is only removed on failure because
        # Gradio still has to serve the video on success.
        leftovers = [face_path, audio_path]
        if not succeeded:
            leftovers.append(out_video)
        for path in leftovers:
            # Best-effort cleanup: must not mask the real error being raised.
            with contextlib.suppress(OSError):
                os.remove(path)


demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Image(type="pil", label="Face Image"),
        gr.Textbox(label="Text (used if no audio)", lines=2),
        gr.Audio(type="filepath", label="Audio (optional, WAV preferred)"),
    ],
    outputs=gr.Video(label="Talking Avatar"),
    title="Open-Source AI Talking Avatar (Wav2Lip SD-GAN)",
    description="Upload a face image and text or audio. Generates a lip-synced avatar video.",
)

demo.launch()