import os
import uuid  # noqa: F401 - no longer used here; kept so existing imports stay intact
import asyncio
import subprocess
import shutil
import sys
import tempfile
import traceback

import nest_asyncio
import gradio as gr
import edge_tts
from deep_translator import GoogleTranslator
from faster_whisper import WhisperModel

# Allow asyncio to run inside Gradio's existing event loop
nest_asyncio.apply()

# Load Whisper model once at startup
# small = good balance between speed and accuracy on CPU
# int8 = quantized for lower memory usage
model = WhisperModel("small", device="cpu", compute_type="int8")

# Supported target languages
# Format: "Display Name": ("translation_code", "edge_tts_voice_name")
languages = {
    "English": ("en", "en-US-EricNeural"),
    "Spanish": ("es", "es-ES-AlvaroNeural"),
    "French": ("fr", "fr-FR-HenriNeural"),
    "German": ("de", "de-DE-ConradNeural"),
    "Italian": ("it", "it-IT-DiegoNeural"),
    "Russian": ("ru", "ru-RU-DmitryNeural"),
}

# deep-translator rejects Google Translate requests of roughly 5000 characters
# or more, so long transcripts are translated in pieces kept below that limit.
_MAX_TRANSLATE_CHARS = 4500


def transcribe(audio):
    """Transcribe an audio file to text using faster-whisper.

    Args:
        audio: Path of the audio file to transcribe.

    Returns:
        The text of all segments joined with single spaces, with leading and
        trailing whitespace stripped ("" when no segment was produced).
    """
    segments, _ = model.transcribe(audio)
    return " ".join(segment.text for segment in segments).strip()


async def tts_async(text, voice, out):
    """Generate speech for *text* with Microsoft Edge TTS and save it to *out*.

    Args:
        text: Text to speak.
        voice: Edge TTS voice name (see the values in ``languages``).
        out: Path of the audio file to write.
    """
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(out)


def run_tts(text, voice, out):
    """Run :func:`tts_async` synchronously.

    ``asyncio.run`` replaces the deprecated ``asyncio.get_event_loop()``
    lookup; ``nest_asyncio.apply()`` at module load keeps it usable even when
    an event loop is already running in the calling thread.
    """
    asyncio.run(tts_async(text, voice, out))


def _run_ffmpeg(args):
    """Run ``ffmpeg -y`` with *args*; raise CalledProcessError on failure."""
    # Argument list (no shell): paths are passed verbatim, never re-parsed.
    subprocess.run(["ffmpeg", "-y", *args], check=True)


def _replace_audio(video_path, audio_path, output_path):
    """Write *output_path* with the video of *video_path* and *audio_path* as AAC."""
    _run_ffmpeg([
        "-i", video_path,
        "-i", audio_path,
        "-c:v", "copy",    # keep the video stream unchanged
        "-c:a", "aac",     # encode the new audio as AAC
        "-map", "0:v:0",   # video from the first input
        "-map", "1:a:0",   # audio from the second input (TTS)
        output_path,
    ])


def _split_text(text, limit):
    """Split *text* on whitespace into chunks of at most *limit* characters."""
    chunks = []
    current = ""
    for word in text.split():
        # A single token longer than the limit has to be cut mid-token.
        while len(word) > limit:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(word[:limit])
            word = word[limit:]
        candidate = f"{current} {word}" if current else word
        if len(candidate) <= limit:
            current = candidate
        else:
            chunks.append(current)
            current = word
    if current:
        chunks.append(current)
    return chunks


def _translate(text, target):
    """Translate *text* to language code *target*, chunking long input."""
    # source="auto" lets the translator detect the original language.
    translator = GoogleTranslator(source="auto", target=target)
    if len(text) <= _MAX_TRANSLATE_CHARS:
        return translator.translate(text)
    parts = [
        translator.translate(chunk)
        for chunk in _split_text(text, _MAX_TRANSLATE_CHARS)
    ]
    return " ".join(part for part in parts if part)


def _cleanup(work_dir, keep=None):
    """Delete *work_dir*, or only its other files when *keep* must survive."""
    if keep is None:
        shutil.rmtree(work_dir, ignore_errors=True)
        return
    try:
        for name in os.listdir(work_dir):
            path = os.path.join(work_dir, name)
            if path != keep:
                os.remove(path)
    except OSError as err:
        # Best effort: leftover temp files must never fail a finished job.
        print(f"Cleanup of {work_dir} incomplete: {err}")


def _dub_video(video_path, language, use_lipsync, work_dir):
    """Run the dubbing steps inside *work_dir*; return (output_path, status)."""
    # Copy uploaded video into our work directory
    input_video = os.path.join(work_dir, "input.mp4")
    shutil.copy(video_path, input_video)

    # Step 1: Resize video to 480p
    # -vf scale=-2:480 keeps aspect ratio, height = 480px
    # Smaller resolution = faster processing in the later steps
    resized = os.path.join(work_dir, "video.mp4")
    _run_ffmpeg(["-i", input_video, "-vf", "scale=-2:480", resized])

    # Step 2: Extract audio track from resized video
    # -vn = no video, -ac 1 = mono, -ar 16000 = 16kHz sample rate
    audio = os.path.join(work_dir, "audio.wav")
    _run_ffmpeg(["-i", resized, "-vn", "-ac", "1", "-ar", "16000", audio])

    # Step 3: Transcribe audio to text using Whisper
    text = transcribe(audio)
    if not text:
        return None, "❌ Transcription failed or audio is silent."

    # Step 4: Translate transcribed text to the target language
    lang_code, voice = languages[language]
    translated = _translate(text, lang_code)
    if not translated:
        return None, "❌ Translation failed."

    # Step 5: Generate TTS speech from translated text
    # Edge TTS delivers MP3 data, so the file is named accordingly instead of
    # being written under a misleading .wav name.
    speech = os.path.join(work_dir, "tts.mp3")
    run_tts(translated, voice, speech)

    # Output file path for final video
    output = os.path.join(work_dir, "lipsync.mp4")

    # Step 6b: No lip sync — just replace the audio track
    if not use_lipsync:
        _replace_audio(resized, speech, output)
        return output, "✅ Done! (audio replacement, no lip sync)"

    # Step 6a: Lip sync mode — run Wav2Lip to animate mouth movements
    # Wav2Lip is handed a genuine WAV file converted from the TTS output.
    speech_wav = os.path.join(work_dir, "tts.wav")
    _run_ffmpeg(["-i", speech, speech_wav])

    result = subprocess.run(
        [
            # Same interpreter as this app, so Wav2Lip sees the same packages.
            sys.executable or "python",
            "Wav2Lip/inference.py",
            "--checkpoint_path", "Wav2Lip/checkpoints/wav2lip_gan.pth",
            "--face", resized,        # input face video
            "--audio", speech_wav,    # new TTS audio
            "--outfile", output,      # output lip-synced video
        ],
        capture_output=True,
        text=True,
    )
    if result.returncode == 0 and os.path.exists(output):
        return output, "✅ Done with lip sync!"

    # If Wav2Lip failed for any reason (non-zero exit or no output file),
    # fall back to simple audio swap.
    print(f"WAV2LIP STDERR: {result.stderr}")
    print(f"WAV2LIP STDOUT: {result.stdout}")
    _replace_audio(resized, speech, output)
    return output, f"⚠️ Wav2Lip failed, used audio replacement instead.\n{result.stderr}"


def process(video, language, use_lipsync):
    """Dub a video into another language, optionally with lip sync.

    Pipeline:
        Step 1 - Resize: scale video to 480p for faster processing
        Step 2 - Extract audio: pull mono 16kHz WAV from the video
        Step 3 - Transcribe: convert audio to text using Whisper
        Step 4 - Translate: translate text with Google Translate
        Step 5 - TTS: generate new speech audio using Edge TTS
        Step 6 - Combine:
            - If lip sync enabled: run Wav2Lip to animate mouth movements
            - If Wav2Lip fails: fall back to simple audio replacement
            - If lip sync disabled: replace the audio track with TTS audio

    Args:
        video: Path of the uploaded video (gr.Video passes the file path as
            a string); None or "" when nothing was uploaded.
        language: Display name of the target language, a key of ``languages``.
        use_lipsync: Whether to run Wav2Lip on the result.

    Returns:
        (output_video_path, status_message). ``output_video_path`` is None
        when the job failed. Errors are reported through the status message
        instead of being raised.
    """
    if not video:
        return None, "❌ Please upload a video first."
    if language not in languages:
        return None, f"❌ Unsupported language: {language}"

    work_dir = None
    try:
        # mkdtemp guarantees a fresh private directory per job, so concurrent
        # users can never end up sharing (and overwriting) the same files.
        work_dir = tempfile.mkdtemp(prefix="dub_")
        output, message = _dub_video(video, language, use_lipsync, work_dir)
    except Exception as e:
        # Top-level boundary: log the traceback for the server console and
        # return the error as the status message shown in the UI.
        traceback.print_exc()
        output, message = None, f"❌ Error: {str(e)}"

    if work_dir is not None:
        # Failed jobs remove everything; finished jobs keep only the result.
        _cleanup(work_dir, keep=output)
    return output, message


# ---------------------------------------------------------------------------
# Gradio UI
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# 🎬 AI Video Dubbing + Lip Sync")

    with gr.Row():
        with gr.Column():
            # Video upload widget — shows preview before processing
            video = gr.Video(label="Upload Video")

            # Target language selector
            lang = gr.Dropdown(
                list(languages),
                value="Spanish",
                label="Target Language",
            )

            # Toggle to enable/disable Wav2Lip lip sync
            # Disabled by default — faster, works on all videos
            # Enable only if video has close-up face shots
            use_lipsync = gr.Checkbox(
                label="Enable Lip Sync (Wav2Lip)",
                value=False,
                info="Enable if video has close-up face. Slower processing.",
            )

            # Submit button
            run = gr.Button("▶ Process", variant="primary")

        with gr.Column():
            # Output video player
            out = gr.Video(label="Result")

            # Status/error message box
            status = gr.Textbox(label="Status", lines=3)

    # Wire up the button click to the process function
    run.click(process, inputs=[video, lang, use_lipsync], outputs=[out, status])

demo.queue()
demo.launch()