artificialguybr committed on
Commit
461ffa8
·
verified ·
1 Parent(s): 5bf2e1d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -38
app.py CHANGED
@@ -6,19 +6,40 @@ import stat
6
  import tempfile
7
  from zipfile import ZipFile
8
 
9
- if not os.path.exists("MuseTalk"):
10
- subprocess.run(["git", "clone", "--depth", "1", "https://github.com/TMElyralab/MuseTalk.git"], check=True)
11
- subprocess.run(["pip", "install", "-q", "-r", "MuseTalk/requirements.txt"], check=True)
12
- subprocess.run(["mim", "install", "mmcv==2.0.1"], check=True)
13
- subprocess.run(["mim", "install", "mmdet==3.1.0"], check=True)
14
- subprocess.run(["mim", "install", "mmpose==1.1.0"], check=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  import gradio as gr
17
  import ffmpeg
18
  import torch
19
  import soundfile as sf
20
  from googletrans import Translator
21
- from huggingface_hub import HfApi, snapshot_download
22
  from qwen_tts import Qwen3TTSModel
23
  import spaces
24
 
@@ -28,7 +49,6 @@ except ImportError:
28
  from moviepy.editor import VideoFileClip
29
 
30
  HF_TOKEN = os.environ.get("HF_TOKEN")
31
- REPO_ID = "artificialguybr/video-dubbing"
32
  MAX_VIDEO_DURATION = 60
33
 
34
  api = HfApi(token=HF_TOKEN)
@@ -52,6 +72,7 @@ language_mapping = {
52
  TTS_MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
53
  tts_model = None
54
 
 
55
  def get_tts_model():
56
  global tts_model
57
  if tts_model is None:
@@ -62,13 +83,19 @@ def get_tts_model():
62
  )
63
  return tts_model
64
 
 
65
  def uid(ext=""):
66
  return os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}{ext}")
67
 
 
68
  def cleanup(*paths):
69
  for p in paths:
70
  if p and os.path.exists(p):
71
- os.remove(p)
 
 
 
 
72
 
73
  def extract_audio_segment(video_path, duration=4.0):
74
  out = uid(".wav")
@@ -79,6 +106,7 @@ def extract_audio_segment(video_path, duration=4.0):
79
  )
80
  return out
81
 
 
82
  @spaces.GPU(duration=120)
83
  def transcribe_audio(file_path):
84
  temp_audio = None
@@ -114,40 +142,43 @@ def transcribe_audio(file_path):
114
 
115
  return result.strip()
116
 
 
117
  @spaces.GPU(duration=120)
118
  def synthesize_speech(translated_text, ref_audio_path, ref_text, target_language_qwen):
119
  model = get_tts_model()
120
-
121
  prompt = model.create_voice_clone_prompt(
122
  ref_audio=ref_audio_path,
123
  ref_text=ref_text,
124
  )
125
-
126
  wavs, sr = model.generate_voice_clone(
127
  text=translated_text,
128
  language=target_language_qwen,
129
  voice_clone_prompt=prompt,
130
  )
131
-
132
  out_path = uid(".wav")
133
  sf.write(out_path, wavs[0], sr)
134
  return out_path
135
 
136
- @spaces.GPU(duration=180)
137
- def run_musetalk(video_path, audio_path, run_uuid):
138
- out_path = f"{run_uuid}_output_video.mp4"
 
139
  try:
140
  subprocess.run(
141
  [
142
- "python", "MuseTalk/inference.py",
143
- "--video", video_path,
 
144
  "--audio", audio_path,
145
- "--output", out_path,
 
 
 
146
  ],
147
  check=True, capture_output=True, text=True,
148
  )
149
- except subprocess.CalledProcessError as e:
150
- gr.Warning(f"MuseTalk failed, falling back to audio replace. Error: {e.stderr[-300:]}")
151
  subprocess.run(
152
  f"ffmpeg -y -i {video_path} -i {audio_path} -c:v copy -c:a aac "
153
  f"-map 0:v:0 -map 1:a:0 {out_path}",
@@ -155,7 +186,8 @@ def run_musetalk(video_path, audio_path, run_uuid):
155
  )
156
  return out_path
157
 
158
- def process_video(video, target_language, use_musetalk):
 
159
  if not video:
160
  return None, "Please upload a video."
161
  if target_language is None:
@@ -165,6 +197,8 @@ def process_video(video, target_language, use_musetalk):
165
  resized = f"/tmp/{run_uuid}_resized.mp4"
166
  audio_raw = f"/tmp/{run_uuid}_audio_raw.wav"
167
  audio_clean = f"/tmp/{run_uuid}_audio_clean.wav"
 
 
168
 
169
  try:
170
  ffmpeg.input(video).output(resized, vf="scale=-2:720").run(quiet=True, overwrite_output=True)
@@ -172,7 +206,6 @@ def process_video(video, target_language, use_musetalk):
172
  info = ffmpeg.probe(resized)
173
  duration = float(next(s for s in info["streams"] if s["codec_type"] == "video")["duration"])
174
  if duration > MAX_VIDEO_DURATION:
175
- cleanup(resized)
176
  return None, f"Video exceeds {MAX_VIDEO_DURATION}s limit."
177
 
178
  ffmpeg.input(resized).output(audio_raw, acodec="pcm_s24le", ar=48000, map="a").run(
@@ -192,12 +225,10 @@ def process_video(video, target_language, use_musetalk):
192
  translated = translator.translate(transcription, dest=lang_code).text
193
 
194
  ref_clip = extract_audio_segment(resized, duration=4.0)
195
- ref_text_short = transcription[:200]
196
-
197
- synth_audio = synthesize_speech(translated, ref_clip, ref_text_short, lang_qwen)
198
 
199
- if use_musetalk:
200
- output_video = run_musetalk(resized, synth_audio, run_uuid)
201
  else:
202
  output_video = f"/tmp/{run_uuid}_output_video.mp4"
203
  subprocess.run(
@@ -209,22 +240,23 @@ def process_video(video, target_language, use_musetalk):
209
  if not os.path.exists(output_video):
210
  return None, "Output video was not generated."
211
 
212
- return output_video, "Done!"
213
 
214
  except Exception as e:
215
  return None, f"Error: {e}"
216
  finally:
217
  cleanup(resized, audio_raw, audio_clean)
218
- if "ref_clip" in locals():
219
  cleanup(ref_clip)
220
- if "synth_audio" in locals():
221
  cleanup(synth_audio)
222
 
 
223
  with gr.Blocks() as demo:
224
  gr.Markdown("# 🎬 AI Video Dubbing")
225
  gr.Markdown(
226
- "Upload a video, pick a target language, and get a dubbed version with the **original speaker's cloned voice** "
227
- "powered by Qwen3-TTS + optional MuseTalk lip sync."
228
  )
229
 
230
  with gr.Row():
@@ -235,10 +267,10 @@ with gr.Blocks() as demo:
235
  label="Target Language",
236
  value="English",
237
  )
238
- use_musetalk = gr.Checkbox(
239
- label="Lip Sync with MuseTalk",
240
  value=False,
241
- info="Recommended for close-up face videos. Adds processing time.",
242
  )
243
  submit_button = gr.Button("🚀 Dub Video", variant="primary")
244
 
@@ -248,14 +280,14 @@ with gr.Blocks() as demo:
248
 
249
  submit_button.click(
250
  process_video,
251
- inputs=[video_input, target_language, use_musetalk],
252
  outputs=[output_video, status],
253
  )
254
 
255
  gr.Markdown("""
256
  ---
257
- **Pipeline:** Whisper large-v3-turbo → Google Translate → Qwen3-TTS (voice clone)MuseTalk (optional)
258
- Developed by [@artificialguybr](https://twitter.com/artificialguybr)
259
  """)
260
 
261
  demo.queue()
 
6
  import tempfile
7
  from zipfile import ZipFile
8
 
9
+
10
def _setup_wav2lip():
    """Prepare the Wav2Lip checkout, its extra packages and the GAN checkpoint.

    The repository is cloned into ``./Wav2Lip`` only when that directory does
    not exist yet, and the extra Python packages are installed right after a
    fresh clone. The checkpoint is fetched with ``wget`` whenever
    ``Wav2Lip/checkpoints/wav2lip_gan.pth`` is missing.

    Raises:
        subprocess.CalledProcessError: if ``git``, ``pip`` or ``wget`` exits
            with a non-zero status.
    """
    # Local import so this block does not depend on the file-level imports.
    import sys

    if not os.path.exists("Wav2Lip"):
        subprocess.run(
            ["git", "clone", "--depth", "1", "https://github.com/Rudrabha/Wav2Lip.git"],
            check=True,
        )
        # Invoke pip through the running interpreter so the packages are
        # installed into the environment this process actually imports from.
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-q", "--no-deps",
             "basicsr", "facexlib", "gfpgan", "batch-face"],
            check=True,
        )

    ckpt_dir = "Wav2Lip/checkpoints"
    ckpt_path = f"{ckpt_dir}/wav2lip_gan.pth"
    if not os.path.exists(ckpt_path):
        os.makedirs(ckpt_dir, exist_ok=True)
        # Download to a side file first: ``wget -O`` creates its output file
        # even when the transfer fails, and a truncated file at ``ckpt_path``
        # would satisfy the existence check above on every later start.
        partial_path = f"{ckpt_path}.part"
        try:
            subprocess.run(
                [
                    "wget", "-q",
                    "https://huggingface.co/camenduru/Wav2Lip/resolve/main/wav2lip_gan.pth",
                    "-O", partial_path,
                ],
                check=True,
            )
            # Only a completed download is promoted to the final path.
            os.replace(partial_path, ckpt_path)
        finally:
            # After a successful replace the side file is already gone; this
            # only removes the leftovers of a failed or interrupted download.
            if os.path.exists(partial_path):
                os.remove(partial_path)
33
+
34
+
35
+ _setup_wav2lip()
36
 
37
  import gradio as gr
38
  import ffmpeg
39
  import torch
40
  import soundfile as sf
41
  from googletrans import Translator
42
+ from huggingface_hub import HfApi
43
  from qwen_tts import Qwen3TTSModel
44
  import spaces
45
 
 
49
  from moviepy.editor import VideoFileClip
50
 
51
  HF_TOKEN = os.environ.get("HF_TOKEN")
 
52
  MAX_VIDEO_DURATION = 60
53
 
54
  api = HfApi(token=HF_TOKEN)
 
72
  TTS_MODEL_ID = "Qwen/Qwen3-TTS-12Hz-1.7B-Base"
73
  tts_model = None
74
 
75
+
76
  def get_tts_model():
77
  global tts_model
78
  if tts_model is None:
 
83
  )
84
  return tts_model
85
 
86
+
87
  def uid(ext=""):
88
  return os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}{ext}")
89
 
90
+
91
  def cleanup(*paths):
92
  for p in paths:
93
  if p and os.path.exists(p):
94
+ try:
95
+ os.remove(p)
96
+ except OSError:
97
+ pass
98
+
99
 
100
  def extract_audio_segment(video_path, duration=4.0):
101
  out = uid(".wav")
 
106
  )
107
  return out
108
 
109
+
110
  @spaces.GPU(duration=120)
111
  def transcribe_audio(file_path):
112
  temp_audio = None
 
142
 
143
  return result.strip()
144
 
145
+
146
  @spaces.GPU(duration=120)
147
  def synthesize_speech(translated_text, ref_audio_path, ref_text, target_language_qwen):
148
  model = get_tts_model()
 
149
  prompt = model.create_voice_clone_prompt(
150
  ref_audio=ref_audio_path,
151
  ref_text=ref_text,
152
  )
 
153
  wavs, sr = model.generate_voice_clone(
154
  text=translated_text,
155
  language=target_language_qwen,
156
  voice_clone_prompt=prompt,
157
  )
 
158
  out_path = uid(".wav")
159
  sf.write(out_path, wavs[0], sr)
160
  return out_path
161
 
162
+
163
+ @spaces.GPU(duration=120)
164
+ def run_wav2lip(video_path, audio_path, run_uuid):
165
+ out_path = f"/tmp/{run_uuid}_output_video.mp4"
166
  try:
167
  subprocess.run(
168
  [
169
+ "python", "Wav2Lip/inference.py",
170
+ "--checkpoint_path", "Wav2Lip/checkpoints/wav2lip_gan.pth",
171
+ "--face", video_path,
172
  "--audio", audio_path,
173
+ "--pads", "0", "15", "0", "0",
174
+ "--resize_factor", "1",
175
+ "--nosmooth",
176
+ "--outfile", out_path,
177
  ],
178
  check=True, capture_output=True, text=True,
179
  )
180
+ except subprocess.CalledProcessError:
181
+ gr.Warning("Wav2Lip failed, falling back to simple audio replace.")
182
  subprocess.run(
183
  f"ffmpeg -y -i {video_path} -i {audio_path} -c:v copy -c:a aac "
184
  f"-map 0:v:0 -map 1:a:0 {out_path}",
 
186
  )
187
  return out_path
188
 
189
+
190
+ def process_video(video, target_language, use_wav2lip):
191
  if not video:
192
  return None, "Please upload a video."
193
  if target_language is None:
 
197
  resized = f"/tmp/{run_uuid}_resized.mp4"
198
  audio_raw = f"/tmp/{run_uuid}_audio_raw.wav"
199
  audio_clean = f"/tmp/{run_uuid}_audio_clean.wav"
200
+ ref_clip = None
201
+ synth_audio = None
202
 
203
  try:
204
  ffmpeg.input(video).output(resized, vf="scale=-2:720").run(quiet=True, overwrite_output=True)
 
206
  info = ffmpeg.probe(resized)
207
  duration = float(next(s for s in info["streams"] if s["codec_type"] == "video")["duration"])
208
  if duration > MAX_VIDEO_DURATION:
 
209
  return None, f"Video exceeds {MAX_VIDEO_DURATION}s limit."
210
 
211
  ffmpeg.input(resized).output(audio_raw, acodec="pcm_s24le", ar=48000, map="a").run(
 
225
  translated = translator.translate(transcription, dest=lang_code).text
226
 
227
  ref_clip = extract_audio_segment(resized, duration=4.0)
228
+ synth_audio = synthesize_speech(translated, ref_clip, transcription[:200], lang_qwen)
 
 
229
 
230
+ if use_wav2lip:
231
+ output_video = run_wav2lip(resized, synth_audio, run_uuid)
232
  else:
233
  output_video = f"/tmp/{run_uuid}_output_video.mp4"
234
  subprocess.run(
 
240
  if not os.path.exists(output_video):
241
  return None, "Output video was not generated."
242
 
243
+ return output_video, "Done!"
244
 
245
  except Exception as e:
246
  return None, f"Error: {e}"
247
  finally:
248
  cleanup(resized, audio_raw, audio_clean)
249
+ if ref_clip:
250
  cleanup(ref_clip)
251
+ if synth_audio:
252
  cleanup(synth_audio)
253
 
254
+
255
  with gr.Blocks() as demo:
256
  gr.Markdown("# 🎬 AI Video Dubbing")
257
  gr.Markdown(
258
+ "Upload a video, pick a target language, and get a dubbed version with the "
259
+ "**original speaker's cloned voice** — Whisper + Qwen3-TTS + Wav2Lip."
260
  )
261
 
262
  with gr.Row():
 
267
  label="Target Language",
268
  value="English",
269
  )
270
+ use_wav2lip = gr.Checkbox(
271
+ label="Lip Sync with Wav2Lip",
272
  value=False,
273
+ info="Recommended for close-up face videos. Adds ~30s processing time.",
274
  )
275
  submit_button = gr.Button("🚀 Dub Video", variant="primary")
276
 
 
280
 
281
  submit_button.click(
282
  process_video,
283
+ inputs=[video_input, target_language, use_wav2lip],
284
  outputs=[output_video, status],
285
  )
286
 
287
  gr.Markdown("""
288
  ---
289
+ **Pipeline:** Whisper large-v3-turbo → Google Translate → Qwen3-TTS voice clone → Wav2Lip (optional)
290
+ By [@artificialguybr](https://twitter.com/artificialguybr)
291
  """)
292
 
293
  demo.queue()