abdelhaqueidali committed on
Commit
9a9bca7
·
verified ·
1 Parent(s): 36b821f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -61
app.py CHANGED
@@ -6,7 +6,6 @@ import os
6
  import uuid
7
  import json
8
  import re
9
- import subprocess
10
 
11
  from nemo.collections.asr.models import ASRModel
12
  from nemo.utils import logging
@@ -329,10 +328,7 @@ def delete_mp4s_except_given_filepath(filepath):
329
  mp4_files_in_dir = [x for x in files_in_dir if x.endswith(".mp4")]
330
  for mp4_file in mp4_files_in_dir:
331
  if mp4_file != filepath:
332
- try:
333
- os.remove(mp4_file)
334
- except Exception:
335
- pass
336
 
337
 
338
  def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newline, progress=gr.Progress()):
@@ -345,8 +341,19 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
345
 
346
  progress(0, desc="Validating input")
347
 
 
 
 
 
 
 
 
 
 
 
 
348
  # Ensure only ONE source is used
349
- inputs_provided = sum([Microphone is not None, File_Upload is not None, Video_Upload is not None])
350
  if inputs_provided > 1:
351
  raise gr.Error("Please use either the microphone, audio file upload, or video upload - not multiple inputs.")
352
  elif inputs_provided == 0:
@@ -356,57 +363,86 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
356
  extracted_audio_path = None
357
  if Microphone is not None:
358
  file = Microphone
 
359
  elif File_Upload is not None:
360
  file = File_Upload
 
361
  else:
362
- # Step: Extract audio track from video safely
363
  progress(0.05, desc="Extracting audio track from video...")
 
 
 
364
 
365
- # Handle Gradio's potential return structure for Video components
366
- vid_path = Video_Upload['video'] if isinstance(Video_Upload, dict) else Video_Upload
367
- extracted_audio_path = os.path.abspath(f"extracted_{utt_id}.wav")
 
 
 
368
 
369
- try:
370
- subprocess.run([
371
- "ffmpeg", "-y", "-i", vid_path,
372
- "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
373
- extracted_audio_path
374
- ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
375
- except subprocess.CalledProcessError as e:
376
- raise gr.Error(f"Error: Could not extract audio from video. FFMPEG output: {e.stderr.decode()}")
377
-
378
  if not os.path.exists(extracted_audio_path):
379
- raise gr.Error("Error: Audio extraction failed silently. Ensure the video has a readable audio track.")
380
-
381
  file = extracted_audio_path
382
-
383
- audio_data, duration = get_audio_data_and_duration(file)
384
-
385
- # Clean up the extracted temporary audio file
 
 
 
 
 
 
 
 
386
  if extracted_audio_path and os.path.exists(extracted_audio_path):
387
  os.remove(extracted_audio_path)
388
 
389
  progress(0.1, desc="Loading speech recognition model")
390
- model_name = "ayymen/stt_zgh_fastconformer_ctc_small"
391
- model = ASRModel.from_pretrained(model_name)
 
 
 
 
 
 
392
 
393
  segments = []
394
  if subs_file is not None:
395
- with open(subs_file.name, 'r', encoding='utf-8') as f:
396
- subs_content = f.read()
397
-
398
- if subs_file.name.lower().endswith('.srt'):
399
- segments = parse_srt(subs_content)
400
- elif subs_file.name.lower().endswith('.lrc'):
401
- segments = parse_lrc(subs_content, duration)
402
- else:
403
- raise gr.Error("Subtitle file must be an .srt or .lrc file.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
404
 
405
  with tempfile.TemporaryDirectory() as tmpdir:
406
  manifest_path = os.path.join(tmpdir, f"{utt_id}_manifest.json")
407
 
408
  if segments:
409
  progress(0.2, desc="Chunking audio and generating manifest")
 
 
410
  with open(manifest_path, 'w', encoding='utf-8') as fout:
411
  for i, seg in enumerate(segments):
412
  S_prime, T = get_S_prime_and_T(seg['text'], model_name, model, seg['end'] - seg['start'])
@@ -429,6 +465,7 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
429
  fout.write(f"{json.dumps(data)}\n")
430
 
431
  resegment_text_to_fill_space = False
 
432
 
433
  else:
434
  audio_path = os.path.join(tmpdir, f'{utt_id}.wav')
@@ -436,6 +473,7 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
436
 
437
  if not text:
438
  progress(0.2, desc="Transcribing audio")
 
439
  text = model.transcribe([audio_path])[0]
440
  if 'hybrid' in model_name:
441
  text = text[0]
@@ -451,6 +489,7 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
451
  f"You could try pasting the transcription into the text input box, correcting any"
452
  " transcription errors, and clicking 'Submit' again."
453
  )
 
454
 
455
  if split_on_newline:
456
  text = "|".join(list(filter(None, text.split("\n"))))
@@ -467,6 +506,7 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
467
  fout.write(f"{json.dumps(data)}\n")
468
 
469
  resegment_text_to_fill_space = "|" not in text
 
470
 
471
  alignment_config = AlignmentConfig(
472
  pretrained_name=model_name,
@@ -485,7 +525,15 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
485
  )
486
 
487
  progress(0.5, desc="Aligning audio")
488
- main(alignment_config)
 
 
 
 
 
 
 
 
489
  progress(0.95, desc="Saving generated alignments")
490
 
491
  ass_path = "word_level.ass"
@@ -493,6 +541,7 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
493
  segment_ctm_path = "segment_level.ctm"
494
 
495
  if segments:
 
496
  merged_ass = ""
497
  header_written = False
498
 
@@ -592,21 +641,26 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
592
  f.write(merged_ctm)
593
 
594
  else:
 
595
  ass_file_for_video = f"{tmpdir}/nfa_output/ass/words/{utt_id}.ass"
596
- with open(ass_file_for_video, "r", encoding="utf-8") as f:
597
- ass_text = f.read()
598
- with open(ass_path, "w", encoding="utf-8") as f:
599
- f.write(ass_text)
600
-
601
- with open(f"{tmpdir}/nfa_output/ctm/words/{utt_id}.ctm", "r", encoding="utf-8") as f:
602
- with open(word_ctm_path, "w", encoding="utf-8") as out_f:
603
- out_f.write(f.read())
604
-
605
- with open(f"{tmpdir}/nfa_output/ctm/segments/{utt_id}.ctm", "r", encoding="utf-8") as f:
606
- with open(segment_ctm_path, "w", encoding="utf-8") as out_f:
607
- out_f.write(f.read())
 
 
 
608
 
609
 
 
610
  segments_for_subs = parse_ass_to_segments(ass_text)
611
 
612
  srt_seg_path = "segments.srt"
@@ -629,19 +683,24 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
629
  with open(elrc_path, "w", encoding="utf-8") as f:
630
  f.write(generate_elrc(segments_for_subs))
631
 
 
632
  full_audio_path = os.path.join(tmpdir, "full_audio.wav")
633
  soundfile.write(full_audio_path, audio_data, SAMPLE_RATE)
634
 
635
- # Added string quotes to safeguard against spaces in temp directories
636
  ffmpeg_command = (
637
  f'ffmpeg -y -i "{full_audio_path}" '
638
- '-f lavfi -i color=c=white:s=1280x720:r=50 '
639
- '-crf 1 -shortest -vcodec libx264 -pix_fmt yuv420p '
640
  f'-vf "ass=\'{ass_path}\'" '
641
  f'"{output_video_filepath}"'
642
  )
643
- os.system(ffmpeg_command)
 
 
 
 
644
 
 
645
  return (
646
  output_video_filepath,
647
  gr.update(value=output_info, visible=True if output_info else False),
@@ -660,10 +719,7 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
660
  def delete_non_tmp_video(video_path):
661
  if video_path:
662
  if os.path.exists(video_path):
663
- try:
664
- os.remove(video_path)
665
- except Exception:
666
- pass
667
  return None
668
 
669
 
@@ -751,11 +807,11 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
751
 
752
  examples = gr.Examples(
753
  examples=[
754
- ["Voice1410.wav", None, None, example_2],
755
- ["Tamazight_For_All.mp3", None, "Tamazight_For_All.srt", ""]
756
  ],
757
- inputs=[audio_file_in, video_file_in, subs_file_in, ref_text]
758
  )
759
 
760
  demo.queue()
761
- demo.launch()
 
6
  import uuid
7
  import json
8
  import re
 
9
 
10
  from nemo.collections.asr.models import ASRModel
11
  from nemo.utils import logging
 
328
  mp4_files_in_dir = [x for x in files_in_dir if x.endswith(".mp4")]
329
  for mp4_file in mp4_files_in_dir:
330
  if mp4_file != filepath:
331
+ os.remove(mp4_file)
 
 
 
332
 
333
 
334
  def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newline, progress=gr.Progress()):
 
341
 
342
  progress(0, desc="Validating input")
343
 
344
+ # FIX: Handle Video upload properly - extract path from tuple if needed
345
+ video_path = None
346
+ if Video_Upload is not None:
347
+ if isinstance(Video_Upload, (tuple, list)):
348
+ video_path = Video_Upload[0] # First element is the file path
349
+ elif isinstance(Video_Upload, str):
350
+ video_path = Video_Upload
351
+ else:
352
+ video_path = Video_Upload
353
+ print(f"Video path extracted: {video_path}")
354
+
355
  # Ensure only ONE source is used
356
+ inputs_provided = sum([Microphone is not None, File_Upload is not None, video_path is not None])
357
  if inputs_provided > 1:
358
  raise gr.Error("Please use either the microphone, audio file upload, or video upload - not multiple inputs.")
359
  elif inputs_provided == 0:
 
363
  extracted_audio_path = None
364
  if Microphone is not None:
365
  file = Microphone
366
+ print(f"Using microphone input: {file}")
367
  elif File_Upload is not None:
368
  file = File_Upload
369
+ print(f"Using audio file upload: {file}")
370
  else:
371
+ # Step: Extract audio track from video
372
  progress(0.05, desc="Extracting audio track from video...")
373
+ extracted_audio_path = f"extracted_{utt_id}.wav"
374
+ ffmpeg_extract_cmd = f'ffmpeg -y -i "{video_path}" -vn -acodec pcm_s16le -ar 16000 -ac 1 {extracted_audio_path}'
375
+ print(f"Running FFmpeg command: {ffmpeg_extract_cmd}")
376
 
377
+ # FIX: Add error checking for FFmpeg
378
+ result = os.system(ffmpeg_extract_cmd)
379
+ if result != 0:
380
+ if os.path.exists(extracted_audio_path):
381
+ os.remove(extracted_audio_path)
382
+ raise gr.Error("Failed to extract audio from video. Make sure the video file is valid and FFmpeg is installed.")
383
 
 
 
 
 
 
 
 
 
 
384
  if not os.path.exists(extracted_audio_path):
385
+ raise gr.Error("Failed to extract audio from video. No audio file was generated.")
386
+
387
  file = extracted_audio_path
388
+ print(f"Audio extracted to: {file}")
389
+
390
+ # FIX: Add validation for audio file
391
+ try:
392
+ audio_data, duration = get_audio_data_and_duration(file)
393
+ print(f"Audio loaded successfully. Duration: {duration:.2f}s")
394
+ except Exception as e:
395
+ if extracted_audio_path and os.path.exists(extracted_audio_path):
396
+ os.remove(extracted_audio_path)
397
+ raise gr.Error(f"Failed to process audio file: {str(e)}")
398
+
399
+ # Clean up the extracted temporary audio file if created
400
  if extracted_audio_path and os.path.exists(extracted_audio_path):
401
  os.remove(extracted_audio_path)
402
 
403
  progress(0.1, desc="Loading speech recognition model")
404
+
405
+ # FIX: Add error handling for model loading
406
+ try:
407
+ model_name = "ayymen/stt_zgh_fastconformer_ctc_small"
408
+ model = ASRModel.from_pretrained(model_name)
409
+ print(f"Model loaded successfully: {model_name}")
410
+ except Exception as e:
411
+ raise gr.Error(f"Failed to load ASR model: {str(e)}")
412
 
413
  segments = []
414
  if subs_file is not None:
415
+ progress(0.15, desc="Parsing subtitle file...")
416
+ # FIX: Handle subs_file properly
417
+ try:
418
+ subs_path = subs_file if isinstance(subs_file, str) else subs_file.name
419
+ print(f"Reading subtitle file: {subs_path}")
420
+
421
+ with open(subs_path, 'r', encoding='utf-8') as f:
422
+ subs_content = f.read()
423
+
424
+ if subs_path.lower().endswith('.srt'):
425
+ segments = parse_srt(subs_content)
426
+ print(f"Parsed {len(segments)} SRT segments")
427
+ elif subs_path.lower().endswith('.lrc'):
428
+ segments = parse_lrc(subs_content, duration)
429
+ print(f"Parsed {len(segments)} LRC segments")
430
+ else:
431
+ raise gr.Error("Subtitle file must be an .srt or .lrc file.")
432
+
433
+ if not segments:
434
+ raise gr.Error("No valid segments found in the subtitle file.")
435
+
436
+ except Exception as e:
437
+ raise gr.Error(f"Failed to parse subtitle file: {str(e)}")
438
 
439
  with tempfile.TemporaryDirectory() as tmpdir:
440
  manifest_path = os.path.join(tmpdir, f"{utt_id}_manifest.json")
441
 
442
  if segments:
443
  progress(0.2, desc="Chunking audio and generating manifest")
444
+ print(f"Processing {len(segments)} segments with alignment")
445
+
446
  with open(manifest_path, 'w', encoding='utf-8') as fout:
447
  for i, seg in enumerate(segments):
448
  S_prime, T = get_S_prime_and_T(seg['text'], model_name, model, seg['end'] - seg['start'])
 
465
  fout.write(f"{json.dumps(data)}\n")
466
 
467
  resegment_text_to_fill_space = False
468
+ print(f"Manifest created at: {manifest_path}")
469
 
470
  else:
471
  audio_path = os.path.join(tmpdir, f'{utt_id}.wav')
 
473
 
474
  if not text:
475
  progress(0.2, desc="Transcribing audio")
476
+ print("No text provided, running ASR transcription...")
477
  text = model.transcribe([audio_path])[0]
478
  if 'hybrid' in model_name:
479
  text = text[0]
 
489
  f"You could try pasting the transcription into the text input box, correcting any"
490
  " transcription errors, and clicking 'Submit' again."
491
  )
492
+ print(f"Transcription: {text}")
493
 
494
  if split_on_newline:
495
  text = "|".join(list(filter(None, text.split("\n"))))
 
506
  fout.write(f"{json.dumps(data)}\n")
507
 
508
  resegment_text_to_fill_space = "|" not in text
509
+ print(f"Manifest created at: {manifest_path}")
510
 
511
  alignment_config = AlignmentConfig(
512
  pretrained_name=model_name,
 
525
  )
526
 
527
  progress(0.5, desc="Aligning audio")
528
+ print("Starting alignment...")
529
+
530
+ # FIX: Add error handling for alignment
531
+ try:
532
+ main(alignment_config)
533
+ print("Alignment completed successfully")
534
+ except Exception as e:
535
+ raise gr.Error(f"Alignment failed: {str(e)}")
536
+
537
  progress(0.95, desc="Saving generated alignments")
538
 
539
  ass_path = "word_level.ass"
 
541
  segment_ctm_path = "segment_level.ctm"
542
 
543
  if segments:
544
+ print("Merging chunk alignment results...")
545
  merged_ass = ""
546
  header_written = False
547
 
 
641
  f.write(merged_ctm)
642
 
643
  else:
644
+ print("Processing single alignment result...")
645
  ass_file_for_video = f"{tmpdir}/nfa_output/ass/words/{utt_id}.ass"
646
+ if os.path.exists(ass_file_for_video):
647
+ with open(ass_file_for_video, "r", encoding="utf-8") as f:
648
+ ass_text = f.read()
649
+ with open(ass_path, "w", encoding="utf-8") as f:
650
+ f.write(ass_text)
651
+
652
+ with open(f"{tmpdir}/nfa_output/ctm/words/{utt_id}.ctm", "r", encoding="utf-8") as f:
653
+ with open(word_ctm_path, "w", encoding="utf-8") as out_f:
654
+ out_f.write(f.read())
655
+
656
+ with open(f"{tmpdir}/nfa_output/ctm/segments/{utt_id}.ctm", "r", encoding="utf-8") as f:
657
+ with open(segment_ctm_path, "w", encoding="utf-8") as out_f:
658
+ out_f.write(f.read())
659
+ else:
660
+ raise gr.Error("Alignment did not produce any output files.")
661
 
662
 
663
+ print("Generating subtitle formats...")
664
  segments_for_subs = parse_ass_to_segments(ass_text)
665
 
666
  srt_seg_path = "segments.srt"
 
683
  with open(elrc_path, "w", encoding="utf-8") as f:
684
  f.write(generate_elrc(segments_for_subs))
685
 
686
+ print("Generating output video...")
687
  full_audio_path = os.path.join(tmpdir, "full_audio.wav")
688
  soundfile.write(full_audio_path, audio_data, SAMPLE_RATE)
689
 
 
690
  ffmpeg_command = (
691
  f'ffmpeg -y -i "{full_audio_path}" '
692
+ f'-f lavfi -i color=c=white:s=1280x720:r=50 '
693
+ f'-crf 1 -shortest -vcodec libx264 -pix_fmt yuv420p '
694
  f'-vf "ass=\'{ass_path}\'" '
695
  f'"{output_video_filepath}"'
696
  )
697
+ print(f"Running FFmpeg command: {ffmpeg_command}")
698
+ result = os.system(ffmpeg_command)
699
+
700
+ if result != 0 or not os.path.exists(output_video_filepath):
701
+ raise gr.Error("Failed to generate the output video. FFmpeg command failed.")
702
 
703
+ print("Alignment process completed successfully!")
704
  return (
705
  output_video_filepath,
706
  gr.update(value=output_info, visible=True if output_info else False),
 
719
  def delete_non_tmp_video(video_path):
720
  if video_path:
721
  if os.path.exists(video_path):
722
+ os.remove(video_path)
 
 
 
723
  return None
724
 
725
 
 
807
 
808
  examples = gr.Examples(
809
  examples=[
810
+ ["Voice1410.wav", None, example_2],
811
+ ["Tamazight_For_All.mp3", "Tamazight_For_All.srt", ""]
812
  ],
813
+ inputs=[audio_file_in, subs_file_in, ref_text]
814
  )
815
 
816
  demo.queue()
817
+ demo.launch()