video-dubbing

Build error

App Files Files Community

artificialguybr committed on Mar 20

Commit

461ffa8

verified ·

1 Parent(s): 5bf2e1d

Update app.py

Browse files

Files changed (1) hide show

app.py +70 -38

app.py CHANGED Viewed

@@ -6,19 +6,40 @@ import stat
 import tempfile
 from zipfile import ZipFile
-if not os.path.exists("MuseTalk"):
-    subprocess.run(["git", "clone", "--depth", "1", "https://github.com/TMElyralab/MuseTalk.git"], check=True)
-    subprocess.run(["pip", "install", "-q", "-r", "MuseTalk/requirements.txt"], check=True)
-    subprocess.run(["mim", "install", "mmcv==2.0.1"], check=True)
-    subprocess.run(["mim", "install", "mmdet==3.1.0"], check=True)
-    subprocess.run(["mim", "install", "mmpose==1.1.0"], check=True)
 import gradio as gr
 import ffmpeg
 import torch
 import soundfile as sf
 from googletrans import Translator
-from huggingface_hub import HfApi, snapshot_download
 from qwen_tts import Qwen3TTSModel
 import spaces
@@ -28,7 +49,6 @@ except ImportError:
     from moviepy.editor import VideoFileClip
 HF_TOKEN = os.environ.get("HF_TOKEN")
-REPO_ID = "artificialguybr/video-dubbing"
 MAX_VIDEO_DURATION = 60
 api = HfApi(token=HF_TOKEN)
@@ -52,6 +72,7 @@ language_mapping = {
 TTS_MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
 tts_model = None
 def get_tts_model():
     global tts_model
     if tts_model is None:
@@ -62,13 +83,19 @@ def get_tts_model():
         )
     return tts_model
 def uid(ext=""):
     """Return a unique path in the system temp directory, suffixed with ``ext``."""
     # Only the path string is built here; no file is created on disk.
     tmp_root = tempfile.gettempdir()
     unique_name = f"{uuid.uuid4().hex}{ext}"
     return os.path.join(tmp_root, unique_name)
 def cleanup(*paths):
     for p in paths:
         if p and os.path.exists(p):
-            os.remove(p)
 def extract_audio_segment(video_path, duration=4.0):
     out = uid(".wav")
@@ -79,6 +106,7 @@ def extract_audio_segment(video_path, duration=4.0):
     )
     return out
 @spaces.GPU(duration=120)
 def transcribe_audio(file_path):
     temp_audio = None
@@ -114,40 +142,43 @@ def transcribe_audio(file_path):
     return result.strip()
 @spaces.GPU(duration=120)
 def synthesize_speech(translated_text, ref_audio_path, ref_text, target_language_qwen):
     """Generate voice-cloned speech for ``translated_text`` and return the WAV path.

     A voice-clone prompt is built from ``ref_audio_path`` and ``ref_text``,
     audio is generated for ``translated_text`` in ``target_language_qwen``,
     and the first returned waveform is written to a new ``.wav`` path from
     ``uid``. The caller is responsible for deleting the returned file.
     """
     tts = get_tts_model()
     clone_prompt = tts.create_voice_clone_prompt(ref_audio=ref_audio_path, ref_text=ref_text)
     waveforms, sample_rate = tts.generate_voice_clone(
         text=translated_text,
         language=target_language_qwen,
         voice_clone_prompt=clone_prompt,
     )
     # Only the first generated waveform is persisted.
     wav_path = uid(".wav")
     sf.write(wav_path, waveforms[0], sample_rate)
     return wav_path
-@spaces.GPU(duration=180)
-def run_musetalk(video_path, audio_path, run_uuid):
-    out_path = f"{run_uuid}_output_video.mp4"
     try:
         subprocess.run(
             [
-                "python", "MuseTalk/inference.py",
-                "--video", video_path,
                 "--audio", audio_path,
-                "--output", out_path,
             ],
             check=True, capture_output=True, text=True,
         )
-    except subprocess.CalledProcessError as e:
-        gr.Warning(f"MuseTalk failed, falling back to audio replace. Error: {e.stderr[-300:]}")
         subprocess.run(
             f"ffmpeg -y -i {video_path} -i {audio_path} -c:v copy -c:a aac "
             f"-map 0:v:0 -map 1:a:0 {out_path}",
@@ -155,7 +186,8 @@ def run_musetalk(video_path, audio_path, run_uuid):
         )
     return out_path
-def process_video(video, target_language, use_musetalk):
     if not video:
         return None, "Please upload a video."
     if target_language is None:
@@ -165,6 +197,8 @@ def process_video(video, target_language, use_musetalk):
     resized = f"/tmp/{run_uuid}_resized.mp4"
     audio_raw = f"/tmp/{run_uuid}_audio_raw.wav"
     audio_clean = f"/tmp/{run_uuid}_audio_clean.wav"
     try:
         ffmpeg.input(video).output(resized, vf="scale=-2:720").run(quiet=True, overwrite_output=True)
@@ -172,7 +206,6 @@ def process_video(video, target_language, use_musetalk):
         info = ffmpeg.probe(resized)
         duration = float(next(s for s in info["streams"] if s["codec_type"] == "video")["duration"])
         if duration > MAX_VIDEO_DURATION:
-            cleanup(resized)
             return None, f"Video exceeds {MAX_VIDEO_DURATION}s limit."
         ffmpeg.input(resized).output(audio_raw, acodec="pcm_s24le", ar=48000, map="a").run(
@@ -192,12 +225,10 @@ def process_video(video, target_language, use_musetalk):
         translated = translator.translate(transcription, dest=lang_code).text
         ref_clip = extract_audio_segment(resized, duration=4.0)
-        ref_text_short = transcription[:200]
-        synth_audio = synthesize_speech(translated, ref_clip, ref_text_short, lang_qwen)
-        if use_musetalk:
-            output_video = run_musetalk(resized, synth_audio, run_uuid)
         else:
             output_video = f"/tmp/{run_uuid}_output_video.mp4"
             subprocess.run(
@@ -209,22 +240,23 @@ def process_video(video, target_language, use_musetalk):
         if not os.path.exists(output_video):
             return None, "Output video was not generated."
-        return output_video, "Done!"
     except Exception as e:
         return None, f"Error: {e}"
     finally:
         cleanup(resized, audio_raw, audio_clean)
-        if "ref_clip" in locals():
             cleanup(ref_clip)
-        if "synth_audio" in locals():
             cleanup(synth_audio)
 with gr.Blocks() as demo:
     gr.Markdown("# 🎬 AI Video Dubbing")
     gr.Markdown(
-        "Upload a video, pick a target language, and get a dubbed version with the **original speaker's cloned voice** "
-        "powered by Qwen3-TTS + optional MuseTalk lip sync."
     )
     with gr.Row():
@@ -235,10 +267,10 @@ with gr.Blocks() as demo:
                 label="Target Language",
                 value="English",
             )
-            use_musetalk = gr.Checkbox(
-                label="Lip Sync with MuseTalk",
                 value=False,
-                info="Recommended for close-up face videos. Adds processing time.",
             )
             submit_button = gr.Button("🚀 Dub Video", variant="primary")
@@ -248,14 +280,14 @@ with gr.Blocks() as demo:
     submit_button.click(
         process_video,
-        inputs=[video_input, target_language, use_musetalk],
         outputs=[output_video, status],
     )
     gr.Markdown("""
     ---
-    **Pipeline:** Whisper large-v3-turbo → Google Translate → Qwen3-TTS (voice clone) → MuseTalk (optional)
-    Developed by [@artificialguybr](https://twitter.com/artificialguybr)
     """)
 demo.queue()

 import tempfile
 from zipfile import ZipFile
+def _setup_wav2lip():
+    if not os.path.exists("Wav2Lip"):
+        subprocess.run(
+            ["git", "clone", "--depth", "1", "https://github.com/Rudrabha/Wav2Lip.git"],
+            check=True,
+        )
+        subprocess.run(
+            ["pip", "install", "-q", "--no-deps",
+             "basicsr", "facexlib", "gfpgan", "batch-face"],
+            check=True,
+        )
+    ckpt_dir = "Wav2Lip/checkpoints"
+    ckpt_path = f"{ckpt_dir}/wav2lip_gan.pth"
+    if not os.path.exists(ckpt_path):
+        os.makedirs(ckpt_dir, exist_ok=True)
+        subprocess.run(
+            [
+                "wget", "-q",
+                "https://huggingface.co/camenduru/Wav2Lip/resolve/main/wav2lip_gan.pth",
+                "-O", ckpt_path,
+            ],
+            check=True,
+        )
+_setup_wav2lip()
 import gradio as gr
 import ffmpeg
 import torch
 import soundfile as sf
 from googletrans import Translator
+from huggingface_hub import HfApi
 from qwen_tts import Qwen3TTSModel
 import spaces
     from moviepy.editor import VideoFileClip
 HF_TOKEN = os.environ.get("HF_TOKEN")
 MAX_VIDEO_DURATION = 60
 api = HfApi(token=HF_TOKEN)
 TTS_MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
 tts_model = None
 def get_tts_model():
     global tts_model
     if tts_model is None:
         )
     return tts_model
 def uid(ext=""):
     """Return a unique path in the system temp directory, suffixed with ``ext``."""
     # Only the path string is built here; no file is created on disk.
     tmp_root = tempfile.gettempdir()
     unique_name = f"{uuid.uuid4().hex}{ext}"
     return os.path.join(tmp_root, unique_name)
 def cleanup(*paths):
     for p in paths:
         if p and os.path.exists(p):
+            try:
+                os.remove(p)
+            except OSError:
+                pass
 def extract_audio_segment(video_path, duration=4.0):
     out = uid(".wav")
     )
     return out
 @spaces.GPU(duration=120)
 def transcribe_audio(file_path):
     temp_audio = None
     return result.strip()
 @spaces.GPU(duration=120)
 def synthesize_speech(translated_text, ref_audio_path, ref_text, target_language_qwen):
     """Generate voice-cloned speech for ``translated_text`` and return the WAV path.

     A voice-clone prompt is built from ``ref_audio_path`` and ``ref_text``,
     audio is generated for ``translated_text`` in ``target_language_qwen``,
     and the first returned waveform is written to a new ``.wav`` path from
     ``uid``. The caller is responsible for deleting the returned file.
     """
     tts = get_tts_model()
     clone_prompt = tts.create_voice_clone_prompt(ref_audio=ref_audio_path, ref_text=ref_text)
     waveforms, sample_rate = tts.generate_voice_clone(
         text=translated_text,
         language=target_language_qwen,
         voice_clone_prompt=clone_prompt,
     )
     # Only the first generated waveform is persisted.
     wav_path = uid(".wav")
     sf.write(wav_path, waveforms[0], sample_rate)
     return wav_path
+@spaces.GPU(duration=120)
+def run_wav2lip(video_path, audio_path, run_uuid):
+    out_path = f"/tmp/{run_uuid}_output_video.mp4"
     try:
         subprocess.run(
             [
+                "python", "Wav2Lip/inference.py",
+                "--checkpoint_path", "Wav2Lip/checkpoints/wav2lip_gan.pth",
+                "--face", video_path,
                 "--audio", audio_path,
+                "--pads", "0", "15", "0", "0",
+                "--resize_factor", "1",
+                "--nosmooth",
+                "--outfile", out_path,
             ],
             check=True, capture_output=True, text=True,
         )
+    except subprocess.CalledProcessError:
+        gr.Warning("Wav2Lip failed, falling back to simple audio replace.")
         subprocess.run(
             f"ffmpeg -y -i {video_path} -i {audio_path} -c:v copy -c:a aac "
             f"-map 0:v:0 -map 1:a:0 {out_path}",
         )
     return out_path
+def process_video(video, target_language, use_wav2lip):
     if not video:
         return None, "Please upload a video."
     if target_language is None:
     resized = f"/tmp/{run_uuid}_resized.mp4"
     audio_raw = f"/tmp/{run_uuid}_audio_raw.wav"
     audio_clean = f"/tmp/{run_uuid}_audio_clean.wav"
+    ref_clip = None
+    synth_audio = None
     try:
         ffmpeg.input(video).output(resized, vf="scale=-2:720").run(quiet=True, overwrite_output=True)
         info = ffmpeg.probe(resized)
         duration = float(next(s for s in info["streams"] if s["codec_type"] == "video")["duration"])
         if duration > MAX_VIDEO_DURATION:
             return None, f"Video exceeds {MAX_VIDEO_DURATION}s limit."
         ffmpeg.input(resized).output(audio_raw, acodec="pcm_s24le", ar=48000, map="a").run(
         translated = translator.translate(transcription, dest=lang_code).text
         ref_clip = extract_audio_segment(resized, duration=4.0)
+        synth_audio = synthesize_speech(translated, ref_clip, transcription[:200], lang_qwen)
+        if use_wav2lip:
+            output_video = run_wav2lip(resized, synth_audio, run_uuid)
         else:
             output_video = f"/tmp/{run_uuid}_output_video.mp4"
             subprocess.run(
         if not os.path.exists(output_video):
             return None, "Output video was not generated."
+        return output_video, "✅ Done!"
     except Exception as e:
         return None, f"Error: {e}"
     finally:
         cleanup(resized, audio_raw, audio_clean)
+        if ref_clip:
             cleanup(ref_clip)
+        if synth_audio:
             cleanup(synth_audio)
 with gr.Blocks() as demo:
     gr.Markdown("# 🎬 AI Video Dubbing")
     gr.Markdown(
+        "Upload a video, pick a target language, and get a dubbed version with the "
+        "**original speaker's cloned voice** — Whisper + Qwen3-TTS + Wav2Lip."
     )
     with gr.Row():
                 label="Target Language",
                 value="English",
             )
+            use_wav2lip = gr.Checkbox(
+                label="Lip Sync with Wav2Lip",
                 value=False,
+                info="Recommended for close-up face videos. Adds ~30s processing time.",
             )
             submit_button = gr.Button("🚀 Dub Video", variant="primary")
     submit_button.click(
         process_video,
+        inputs=[video_input, target_language, use_wav2lip],
         outputs=[output_video, status],
     )
     gr.Markdown("""
     ---
+    **Pipeline:** Whisper large-v3-turbo → Google Translate → Qwen3-TTS voice clone → Wav2Lip (optional)
+    By [@artificialguybr](https://twitter.com/artificialguybr)
     """)
 demo.queue()