Captions-word-level-Forced-Alignment

Sleeping

App Files Community

abdelhaqueidali committed 11 days ago

Commit

9a9bca7

verified ·

1 Parent(s): 36b821f

Update app.py

Browse files

Files changed (1) hide show

app.py +117 -61

app.py CHANGED Viewed

@@ -6,7 +6,6 @@ import os
 import uuid
 import json
 import re
-import subprocess
 from nemo.collections.asr.models import ASRModel
 from nemo.utils import logging
@@ -329,10 +328,7 @@ def delete_mp4s_except_given_filepath(filepath):
     mp4_files_in_dir = [x for x in files_in_dir if x.endswith(".mp4")]
     for mp4_file in mp4_files_in_dir:
         if mp4_file != filepath:
-            try:
-                os.remove(mp4_file)
-            except Exception:
-                pass
 def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newline, progress=gr.Progress()):
@@ -345,8 +341,19 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
     progress(0, desc="Validating input")
     # Ensure only ONE source is used
-    inputs_provided = sum([Microphone is not None, File_Upload is not None, Video_Upload is not None])
     if inputs_provided > 1:
         raise gr.Error("Please use either the microphone, audio file upload, or video upload - not multiple inputs.")
     elif inputs_provided == 0:
@@ -356,57 +363,86 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
     extracted_audio_path = None
     if Microphone is not None:
         file = Microphone
     elif File_Upload is not None:
         file = File_Upload
     else:
-        # Step: Extract audio track from video safely
         progress(0.05, desc="Extracting audio track from video...")
-        # Handle Gradio's potential return structure for Video components
-        vid_path = Video_Upload['video'] if isinstance(Video_Upload, dict) else Video_Upload
-        extracted_audio_path = os.path.abspath(f"extracted_{utt_id}.wav")
-        try:
-            subprocess.run([
-                "ffmpeg", "-y", "-i", vid_path,
-                "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
-                extracted_audio_path
-            ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        except subprocess.CalledProcessError as e:
-            raise gr.Error(f"Error: Could not extract audio from video. FFMPEG output: {e.stderr.decode()}")
         if not os.path.exists(extracted_audio_path):
-            raise gr.Error("Error: Audio extraction failed silently. Ensure the video has a readable audio track.")
         file = extracted_audio_path
-    audio_data, duration = get_audio_data_and_duration(file)
-    # Clean up the extracted temporary audio file
     if extracted_audio_path and os.path.exists(extracted_audio_path):
         os.remove(extracted_audio_path)
     progress(0.1, desc="Loading speech recognition model")
-    model_name = "ayymen/stt_zgh_fastconformer_ctc_small"
-    model = ASRModel.from_pretrained(model_name)
     segments = []
     if subs_file is not None:
-        with open(subs_file.name, 'r', encoding='utf-8') as f:
-            subs_content = f.read()
-        if subs_file.name.lower().endswith('.srt'):
-            segments = parse_srt(subs_content)
-        elif subs_file.name.lower().endswith('.lrc'):
-            segments = parse_lrc(subs_content, duration)
-        else:
-            raise gr.Error("Subtitle file must be an .srt or .lrc file.")
     with tempfile.TemporaryDirectory() as tmpdir:
         manifest_path = os.path.join(tmpdir, f"{utt_id}_manifest.json")
         if segments:
             progress(0.2, desc="Chunking audio and generating manifest")
             with open(manifest_path, 'w', encoding='utf-8') as fout:
                 for i, seg in enumerate(segments):
                     S_prime, T = get_S_prime_and_T(seg['text'], model_name, model, seg['end'] - seg['start'])
@@ -429,6 +465,7 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
                     fout.write(f"{json.dumps(data)}\n")
             resegment_text_to_fill_space = False
         else:
             audio_path = os.path.join(tmpdir, f'{utt_id}.wav')
@@ -436,6 +473,7 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
             if not text:
                 progress(0.2, desc="Transcribing audio")
                 text = model.transcribe([audio_path])[0]
                 if 'hybrid' in model_name:
                     text = text[0]
@@ -451,6 +489,7 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
                     f"You could try pasting the transcription into the text input box, correcting any"
                     " transcription errors, and clicking 'Submit' again."
                 )
             if split_on_newline:
                 text = "|".join(list(filter(None, text.split("\n"))))
@@ -467,6 +506,7 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
                 fout.write(f"{json.dumps(data)}\n")
             resegment_text_to_fill_space = "|" not in text
         alignment_config = AlignmentConfig(
             pretrained_name=model_name,
@@ -485,7 +525,15 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
         )
         progress(0.5, desc="Aligning audio")
-        main(alignment_config)
         progress(0.95, desc="Saving generated alignments")
         ass_path = "word_level.ass"
@@ -493,6 +541,7 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
         segment_ctm_path = "segment_level.ctm"
         if segments:
             merged_ass = ""
             header_written = False
@@ -592,21 +641,26 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
                     f.write(merged_ctm)
         else:
             ass_file_for_video = f"{tmpdir}/nfa_output/ass/words/{utt_id}.ass"
-            with open(ass_file_for_video, "r", encoding="utf-8") as f:
-                ass_text = f.read()
-            with open(ass_path, "w", encoding="utf-8") as f:
-                f.write(ass_text)
-            with open(f"{tmpdir}/nfa_output/ctm/words/{utt_id}.ctm", "r", encoding="utf-8") as f:
-                with open(word_ctm_path, "w", encoding="utf-8") as out_f:
-                    out_f.write(f.read())
-            with open(f"{tmpdir}/nfa_output/ctm/segments/{utt_id}.ctm", "r", encoding="utf-8") as f:
-                with open(segment_ctm_path, "w", encoding="utf-8") as out_f:
-                    out_f.write(f.read())
         segments_for_subs = parse_ass_to_segments(ass_text)
         srt_seg_path = "segments.srt"
@@ -629,19 +683,24 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
         with open(elrc_path, "w", encoding="utf-8") as f:
             f.write(generate_elrc(segments_for_subs))
         full_audio_path = os.path.join(tmpdir, "full_audio.wav")
         soundfile.write(full_audio_path, audio_data, SAMPLE_RATE)
-        # Added string quotes to safeguard against spaces in temp directories
         ffmpeg_command = (
             f'ffmpeg -y -i "{full_audio_path}" '
-            '-f lavfi -i color=c=white:s=1280x720:r=50 '
-            '-crf 1 -shortest -vcodec libx264 -pix_fmt yuv420p '
             f'-vf "ass=\'{ass_path}\'" '
             f'"{output_video_filepath}"'
         )
-        os.system(ffmpeg_command)
     return (
         output_video_filepath,
         gr.update(value=output_info, visible=True if output_info else False),
@@ -660,10 +719,7 @@ def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newli
 def delete_non_tmp_video(video_path):
     if video_path:
         if os.path.exists(video_path):
-            try:
-                os.remove(video_path)
-            except Exception:
-                pass
     return None
@@ -751,11 +807,11 @@ with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
     examples = gr.Examples(
         examples=[
-            ["Voice1410.wav", None, None, example_2],
-            ["Tamazight_For_All.mp3", None, "Tamazight_For_All.srt", ""]
         ],
-        inputs=[audio_file_in, video_file_in, subs_file_in, ref_text]
     )
     demo.queue()
-    demo.launch()

 import uuid
 import json
 import re
 from nemo.collections.asr.models import ASRModel
 from nemo.utils import logging
     mp4_files_in_dir = [x for x in files_in_dir if x.endswith(".mp4")]
     for mp4_file in mp4_files_in_dir:
         if mp4_file != filepath:
+            os.remove(mp4_file)
 def align(Microphone, File_Upload, Video_Upload, subs_file, text, split_on_newline, progress=gr.Progress()):
     progress(0, desc="Validating input")
+    # FIX: Handle Video upload properly - extract path from tuple if needed
+    video_path = None
+    if Video_Upload is not None:
+        if isinstance(Video_Upload, (tuple, list)):
+            video_path = Video_Upload[0]  # First element is the file path
+        elif isinstance(Video_Upload, str):
+            video_path = Video_Upload
+        else:
+            video_path = Video_Upload
+        print(f"Video path extracted: {video_path}")
     # Ensure only ONE source is used
+    inputs_provided = sum([Microphone is not None, File_Upload is not None, video_path is not None])
     if inputs_provided > 1:
         raise gr.Error("Please use either the microphone, audio file upload, or video upload - not multiple inputs.")
     elif inputs_provided == 0:
     extracted_audio_path = None
     if Microphone is not None:
         file = Microphone
+        print(f"Using microphone input: {file}")
     elif File_Upload is not None:
         file = File_Upload
+        print(f"Using audio file upload: {file}")
     else:
+        # Step: Extract audio track from video
         progress(0.05, desc="Extracting audio track from video...")
+        extracted_audio_path = f"extracted_{utt_id}.wav"
+        ffmpeg_extract_cmd = f'ffmpeg -y -i "{video_path}" -vn -acodec pcm_s16le -ar 16000 -ac 1 {extracted_audio_path}'
+        print(f"Running FFmpeg command: {ffmpeg_extract_cmd}")
+        # FIX: Add error checking for FFmpeg
+        result = os.system(ffmpeg_extract_cmd)
+        if result != 0:
+            if os.path.exists(extracted_audio_path):
+                os.remove(extracted_audio_path)
+            raise gr.Error("Failed to extract audio from video. Make sure the video file is valid and FFmpeg is installed.")
         if not os.path.exists(extracted_audio_path):
+            raise gr.Error("Failed to extract audio from video. No audio file was generated.")
         file = extracted_audio_path
+        print(f"Audio extracted to: {file}")
+    # FIX: Add validation for audio file
+    try:
+        audio_data, duration = get_audio_data_and_duration(file)
+        print(f"Audio loaded successfully. Duration: {duration:.2f}s")
+    except Exception as e:
+        if extracted_audio_path and os.path.exists(extracted_audio_path):
+            os.remove(extracted_audio_path)
+        raise gr.Error(f"Failed to process audio file: {str(e)}")
+    # Clean up the extracted temporary audio file if created
     if extracted_audio_path and os.path.exists(extracted_audio_path):
         os.remove(extracted_audio_path)
     progress(0.1, desc="Loading speech recognition model")
+    # FIX: Add error handling for model loading
+    try:
+        model_name = "ayymen/stt_zgh_fastconformer_ctc_small"
+        model = ASRModel.from_pretrained(model_name)
+        print(f"Model loaded successfully: {model_name}")
+    except Exception as e:
+        raise gr.Error(f"Failed to load ASR model: {str(e)}")
     segments = []
     if subs_file is not None:
+        progress(0.15, desc="Parsing subtitle file...")
+        # FIX: Handle subs_file properly
+        try:
+            subs_path = subs_file if isinstance(subs_file, str) else subs_file.name
+            print(f"Reading subtitle file: {subs_path}")
+            with open(subs_path, 'r', encoding='utf-8') as f:
+                subs_content = f.read()
+            if subs_path.lower().endswith('.srt'):
+                segments = parse_srt(subs_content)
+                print(f"Parsed {len(segments)} SRT segments")
+            elif subs_path.lower().endswith('.lrc'):
+                segments = parse_lrc(subs_content, duration)
+                print(f"Parsed {len(segments)} LRC segments")
+            else:
+                raise gr.Error("Subtitle file must be an .srt or .lrc file.")
+            if not segments:
+                raise gr.Error("No valid segments found in the subtitle file.")
+        except Exception as e:
+            raise gr.Error(f"Failed to parse subtitle file: {str(e)}")
     with tempfile.TemporaryDirectory() as tmpdir:
         manifest_path = os.path.join(tmpdir, f"{utt_id}_manifest.json")
         if segments:
             progress(0.2, desc="Chunking audio and generating manifest")
+            print(f"Processing {len(segments)} segments with alignment")
             with open(manifest_path, 'w', encoding='utf-8') as fout:
                 for i, seg in enumerate(segments):
                     S_prime, T = get_S_prime_and_T(seg['text'], model_name, model, seg['end'] - seg['start'])
                     fout.write(f"{json.dumps(data)}\n")
             resegment_text_to_fill_space = False
+            print(f"Manifest created at: {manifest_path}")
         else:
             audio_path = os.path.join(tmpdir, f'{utt_id}.wav')
             if not text:
                 progress(0.2, desc="Transcribing audio")
+                print("No text provided, running ASR transcription...")
                 text = model.transcribe([audio_path])[0]
                 if 'hybrid' in model_name:
                     text = text[0]
                     f"You could try pasting the transcription into the text input box, correcting any"
                     " transcription errors, and clicking 'Submit' again."
                 )
+                print(f"Transcription: {text}")
             if split_on_newline:
                 text = "|".join(list(filter(None, text.split("\n"))))
                 fout.write(f"{json.dumps(data)}\n")
             resegment_text_to_fill_space = "|" not in text
+            print(f"Manifest created at: {manifest_path}")
         alignment_config = AlignmentConfig(
             pretrained_name=model_name,
         )
         progress(0.5, desc="Aligning audio")
+        print("Starting alignment...")
+        # FIX: Add error handling for alignment
+        try:
+            main(alignment_config)
+            print("Alignment completed successfully")
+        except Exception as e:
+            raise gr.Error(f"Alignment failed: {str(e)}")
         progress(0.95, desc="Saving generated alignments")
         ass_path = "word_level.ass"
         segment_ctm_path = "segment_level.ctm"
         if segments:
+            print("Merging chunk alignment results...")
             merged_ass = ""
             header_written = False
                     f.write(merged_ctm)
         else:
+            print("Processing single alignment result...")
             ass_file_for_video = f"{tmpdir}/nfa_output/ass/words/{utt_id}.ass"
+            if os.path.exists(ass_file_for_video):
+                with open(ass_file_for_video, "r", encoding="utf-8") as f:
+                    ass_text = f.read()
+                with open(ass_path, "w", encoding="utf-8") as f:
+                    f.write(ass_text)
+                with open(f"{tmpdir}/nfa_output/ctm/words/{utt_id}.ctm", "r", encoding="utf-8") as f:
+                    with open(word_ctm_path, "w", encoding="utf-8") as out_f:
+                        out_f.write(f.read())
+                with open(f"{tmpdir}/nfa_output/ctm/segments/{utt_id}.ctm", "r", encoding="utf-8") as f:
+                    with open(segment_ctm_path, "w", encoding="utf-8") as out_f:
+                        out_f.write(f.read())
+            else:
+                raise gr.Error("Alignment did not produce any output files.")
+        print("Generating subtitle formats...")
         segments_for_subs = parse_ass_to_segments(ass_text)
         srt_seg_path = "segments.srt"
         with open(elrc_path, "w", encoding="utf-8") as f:
             f.write(generate_elrc(segments_for_subs))
+        print("Generating output video...")
         full_audio_path = os.path.join(tmpdir, "full_audio.wav")
         soundfile.write(full_audio_path, audio_data, SAMPLE_RATE)
         ffmpeg_command = (
             f'ffmpeg -y -i "{full_audio_path}" '
+            f'-f lavfi -i color=c=white:s=1280x720:r=50 '
+            f'-crf 1 -shortest -vcodec libx264 -pix_fmt yuv420p '
             f'-vf "ass=\'{ass_path}\'" '
             f'"{output_video_filepath}"'
         )
+        print(f"Running FFmpeg command: {ffmpeg_command}")
+        result = os.system(ffmpeg_command)
+        if result != 0 or not os.path.exists(output_video_filepath):
+            raise gr.Error("Failed to generate the output video. FFmpeg command failed.")
+    print("Alignment process completed successfully!")
     return (
         output_video_filepath,
         gr.update(value=output_info, visible=True if output_info else False),
 def delete_non_tmp_video(video_path):
     if video_path:
         if os.path.exists(video_path):
+            os.remove(video_path)
     return None
     examples = gr.Examples(
         examples=[
+            ["Voice1410.wav", None, example_2],
+            ["Tamazight_For_All.mp3", "Tamazight_For_All.srt", ""]
         ],
+        inputs=[audio_file_in, subs_file_in, ref_text]
     )
     demo.queue()
+    demo.launch()