import json
import os
import re
import subprocess
import tempfile
import uuid

import gradio as gr
import librosa
import soundfile
from nemo.collections.asr.models import ASRModel
from nemo.utils import logging

from align import main, AlignmentConfig, ASSFileConfig


# Sample rate (Hz) that loaded audio is converted to and that all WAV files
# written by this module use.
SAMPLE_RATE = 16000

# Set the verbosity of the NeMo logger to INFO.
logging.setLevel(logging.INFO)


def format_srt_time(secs):
    """Format a time in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds *before* being split into
    fields, so a rounding carry propagates all the way up (59.9996 s becomes
    ``00:01:00,000`` rather than ``00:00:60,000``). Negative inputs are
    clamped to zero.

    Args:
        secs: Time offset in seconds.

    Returns:
        The formatted timestamp string.
    """
    total_ms = max(0, int(round(secs * 1000)))
    total_secs, ms = divmod(total_ms, 1000)
    total_mins, s = divmod(total_secs, 60)
    h, m = divmod(total_mins, 60)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"


def format_lrc_time(secs):
    """Format a time in seconds as an LRC timestamp ``MM:SS.cc``.

    The value is rounded to whole centiseconds *before* being split into
    fields, so a rounding carry propagates into the minutes (59.996 s becomes
    ``01:00.00`` rather than ``00:60.00``). Minutes are not wrapped at 60.
    Negative inputs are clamped to zero.

    Args:
        secs: Time offset in seconds.

    Returns:
        The formatted timestamp string (without the surrounding brackets).
    """
    total_cs = max(0, int(round(secs * 100)))
    total_secs, cs = divmod(total_cs, 100)
    m, s = divmod(total_secs, 60)
    return f"{m:02d}:{s:02d}.{cs:02d}"


def build_color_map(raw_text):
    """Return the active ``\\c`` colour for every visible character of an ASS text.

    The text is scanned left to right. ``{...}`` override blocks are not
    visible; if one contains a ``\\c&H<hex>&`` tag, that hex value (lower-cased)
    becomes the current colour. Every other character is assigned the colour
    in effect at that point, starting with ``"default"``.

    The resulting list has exactly one entry per character of
    ``re.sub(r'\\{[^}]+\\}', '', raw_text)``, so it can be indexed with
    character offsets into that tag-stripped string.

    Args:
        raw_text: The text field of an ASS ``Dialogue`` line, override tags included.

    Returns:
        A list of colour strings, one per visible character.
    """
    # The second alternative also accepts a lone "{" that does not open a
    # valid override block ("{}" or an unterminated "{"). Tag stripping leaves
    # such a brace in the text, so it must get a colour entry as well or every
    # later index would be shifted by one.
    token_re = re.compile(r'(\{[^}]+\})|([^{]+|\{)')
    color_re = re.compile(r'\\c&H([0-9a-fA-F]+)&', re.IGNORECASE)

    color_map = []
    current_color = "default"
    for token in token_re.finditer(raw_text):
        tag, visible = token.groups()
        if tag:
            found = color_re.search(tag)
            if found:
                current_color = found.group(1).lower()
        else:
            color_map.extend([current_color] * len(visible))
    return color_map


def parse_ass_to_segments(text):
    """Group ASS ``Dialogue`` lines into segments and derive per-word timings.

    Consecutive dialogue lines whose visible text is identical are merged into
    one segment; every merged line is kept as a "slice". For each word, the
    colour under its first character is looked up in every slice, and the
    slice start times at which that colour changes are used as the word's
    start/end times.

    Args:
        text: Full contents of an ASS subtitle file.

    Returns:
        A list of segment dicts with the keys ``cleanText`` (tags removed,
        ``\\N`` and whitespace runs collapsed to single spaces),
        ``cleanTextRaw`` (tags removed only), ``startTime``, ``endTime``,
        ``slices`` (the dialogue dicts that were merged) and ``words``. Each
        word is a dict with ``text``, ``startIndex`` (offset into
        ``cleanTextRaw``), ``isFirstOfSegment``, ``startTime`` and ``endTime``.
    """
    tag_re = re.compile(r'\{[^}]+\}')
    prefix = 'Dialogue:'

    dialogues = []
    for line in text.split('\n'):
        stripped = line.strip()
        if not stripped.startswith(prefix):
            continue
        # Slice the *stripped* line so leading whitespace cannot shift the
        # fields, and cap the split so commas inside the text stay in field 9.
        fields = stripped[len(prefix):].split(',', 9)
        if len(fields) < 10:
            # Malformed dialogue line: not enough fields to hold times and text.
            continue
        raw_text = fields[9]
        clean_text_raw = tag_re.sub('', raw_text)
        clean_text = re.sub(r'\\N', ' ', clean_text_raw)
        clean_text = re.sub(r'\s+', ' ', clean_text).strip()
        dialogues.append({
            'start': parse_ass_time(fields[1]),
            'end': parse_ass_time(fields[2]),
            'rawText': raw_text,
            'cleanTextRaw': clean_text_raw,
            'cleanText': clean_text,
        })

    # Merge runs of consecutive dialogues that show the same visible text.
    segments = []
    current_segment = None
    for d in dialogues:
        if current_segment and current_segment['cleanText'] == d['cleanText']:
            current_segment['slices'].append(d)
            current_segment['endTime'] = d['end']
            continue
        if current_segment:
            segments.append(current_segment)
        current_segment = {
            'cleanText': d['cleanText'],
            'cleanTextRaw': d['cleanTextRaw'],
            'startTime': d['start'],
            'endTime': d['end'],
            'slices': [d],
        }
    if current_segment:
        segments.append(current_segment)

    for seg in segments:
        slices = seg['slices']
        # One colour map per slice, computed once and shared by all words.
        slice_color_maps = [build_color_map(s['rawText']) for s in slices]
        if not slice_color_maps or not slice_color_maps[0]:
            seg['words'] = []
            continue

        # Colour of the last character in the first slice is the baseline.
        # NOTE(review): presumably the "not yet highlighted" colour - confirm
        # against whatever produces the ASS file.
        base_color = slice_color_maps[0][-1]

        # "\N" is an ASS line break, not part of a word. Replacing it with two
        # spaces keeps every character offset identical to cleanTextRaw while
        # letting words on either side of the break be found separately.
        searchable = seg['cleanTextRaw'].replace('\\N', '  ')
        words = [
            {
                'text': found.group(0),
                'startIndex': found.start(),
                'isFirstOfSegment': position == 0,
            }
            for position, found in enumerate(re.finditer(r'\S+', searchable))
        ]

        for w in words:
            char_pos = w['startIndex']
            colors = [
                cmap[char_pos] if char_pos < len(cmap) else "default"
                for cmap in slice_color_maps
            ]
            c0 = colors[0]

            # Start times of the slices at which this word's colour changes.
            transitions = []
            prev_color = c0
            for slice_d, c_curr in zip(slices[1:], colors[1:]):
                if c_curr != prev_color:
                    transitions.append(slice_d['start'])
                    prev_color = c_curr

            if c0 == base_color:
                # Word begins in the baseline colour: the first change marks
                # its start, the second change marks its end.
                w_start = transitions[0] if len(transitions) > 0 else seg['startTime']
                w_end = transitions[1] if len(transitions) > 1 else seg['endTime']
            else:
                # Word is already in a non-baseline colour in the first slice:
                # it starts with the segment and ends at the first change.
                w_start = seg['startTime']
                w_end = transitions[0] if len(transitions) > 0 else seg['endTime']

            w['startTime'] = w_start
            w['endTime'] = w_end

        if words:
            # Pin the outer words to the segment boundaries.
            words[0]['startTime'] = seg['startTime']
            words[-1]['endTime'] = seg['endTime']

        seg['words'] = words

    return segments


def generate_srt_segments(segments):
    """Render one numbered SRT cue per segment.

    Args:
        segments: Segment dicts providing ``startTime``, ``endTime`` and ``cleanText``.

    Returns:
        The SRT document as a single string with surrounding whitespace stripped.
    """
    cue_lines = []
    for number, segment in enumerate(segments, start=1):
        begin = format_srt_time(segment['startTime'])
        finish = format_srt_time(segment['endTime'])
        cue_lines += [str(number), f"{begin} --> {finish}", segment['cleanText'], ""]
    return "\n".join(cue_lines).strip()


def generate_srt_word_by_word(segments):
    """Render one numbered SRT cue per word, numbered continuously across segments.

    Args:
        segments: Segment dicts whose ``words`` entries provide ``startTime``,
            ``endTime`` and ``text``.

    Returns:
        The SRT document as a single string with surrounding whitespace stripped.
    """
    every_word = (word for segment in segments for word in segment['words'])
    cue_lines = []
    for number, word in enumerate(every_word, start=1):
        begin = format_srt_time(word['startTime'])
        finish = format_srt_time(word['endTime'])
        cue_lines += [str(number), f"{begin} --> {finish}", word['text'], ""]
    return "\n".join(cue_lines).strip()


def generate_srt_additive(segments):
    """Render SRT cues in which each segment's text builds up word by word.

    Every word yields one cue showing all words of its segment up to and
    including that word. The first cue of a segment starts at the word's own
    start time; each later cue starts where the previous word ended.

    Args:
        segments: Segment dicts whose ``words`` entries provide ``startTime``,
            ``endTime`` and ``text``.

    Returns:
        The SRT document as a single string with surrounding whitespace stripped.
    """
    cue_lines = []
    number = 0
    for segment in segments:
        words = segment['words']
        shown = []
        # Pair each word with its predecessor (None for the first word).
        for earlier, word in zip([None] + words[:-1], words):
            number += 1
            begin = word['startTime'] if earlier is None else earlier['endTime']
            shown.append(word['text'])
            cue_lines.append(str(number))
            cue_lines.append(f"{format_srt_time(begin)} --> {format_srt_time(word['endTime'])}")
            cue_lines.append(" ".join(shown))
            cue_lines.append("")
    return "\n".join(cue_lines).strip()


def generate_lrc(segments):
    """Render segments as LRC: a timed text line followed by an empty end marker.

    Args:
        segments: Segment dicts providing ``startTime``, ``endTime`` and ``cleanText``.

    Returns:
        The LRC document as a single string with surrounding whitespace stripped.
    """
    lrc_lines = []
    for segment in segments:
        opening = f"[{format_lrc_time(segment['startTime'])}]{segment['cleanText']}"
        closing = f"[{format_lrc_time(segment['endTime'])}]"
        lrc_lines += [opening, closing]
    return "\n".join(lrc_lines).strip()


def generate_elrc(segments):
    """Render segments as enhanced LRC with a ``<mm:ss.cc>`` tag before each word.

    Each segment produces a line starting with its ``[start]`` tag followed by
    the space-separated tagged words, and then a bare ``[end]`` line.

    Args:
        segments: Segment dicts providing ``startTime``, ``endTime`` and ``words``
            (each word providing ``startTime`` and ``text``).

    Returns:
        The ELRC document as a single string with surrounding whitespace stripped.
    """
    elrc_lines = []
    for segment in segments:
        tagged_words = " ".join(
            f"<{format_lrc_time(word['startTime'])}>{word['text']}"
            for word in segment['words']
        )
        elrc_lines.append(f"[{format_lrc_time(segment['startTime'])}]" + tagged_words)
        elrc_lines.append(f"[{format_lrc_time(segment['endTime'])}]")
    return "\n".join(elrc_lines).strip()


def parse_srt(content):
    """Parse SRT text into a list of timed segments.

    Line endings are normalised to ``\\n`` first, so files saved with Windows
    (``\\r\\n``) or old Mac (``\\r``) line endings parse the same as Unix ones.
    Line breaks inside a cue's text are replaced by single spaces.

    Args:
        content: Full contents of an ``.srt`` file.

    Returns:
        A list of dicts with ``start`` and ``end`` (seconds, float) and ``text``.
    """
    normalized = content.replace('\r\n', '\n').replace('\r', '\n')
    cue_re = re.compile(
        r'\d+\n(\d{2}:\d{2}:\d{2},\d{3}) --> (\d{2}:\d{2}:\d{2},\d{3})\n((?:(?!\n\n).)*)',
        re.DOTALL,
    )

    def time_to_sec(t_str):
        """Convert ``HH:MM:SS,mmm`` to seconds."""
        h, m, s_ms = t_str.split(':')
        s, ms = s_ms.split(',')
        return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000.0

    # The appended blank line guarantees the final cue is terminated.
    return [
        {
            "start": time_to_sec(start),
            "end": time_to_sec(end),
            "text": body.replace('\n', ' ').strip(),
        }
        for start, end, body in cue_re.findall(normalized + "\n\n")
    ]


def parse_lrc(content, audio_duration):
    """Parse LRC text into a list of timed segments.

    Every ``[mm:ss.xx]`` line is a marker. A marker with text becomes a
    segment that lasts until the next marker (or until ``audio_duration`` for
    the last one). Markers whose text is empty or ``"#"`` only serve as end
    points, and segments that would not end after they start are dropped.

    Args:
        content: Full contents of an ``.lrc`` file.
        audio_duration: Length of the audio in seconds, used as the end of the
            final marker.

    Returns:
        A list of dicts with ``start`` and ``end`` (seconds, float) and ``text``.
    """
    marker_re = re.compile(r'\[(\d{2}):(\d{2}\.\d{2,3})\](.*)')

    markers = []
    for raw_line in content.split('\n'):
        found = marker_re.match(raw_line.strip())
        if found is None:
            continue
        minutes, seconds, lyric = found.groups()
        markers.append((int(minutes) * 60 + float(seconds), lyric.strip()))

    # The end of each marker is the start of the one after it; the last marker
    # runs to the end of the audio.
    end_times = [start for start, _ in markers[1:]] + [audio_duration]

    segments = []
    for (start, lyric), end in zip(markers, end_times):
        if not lyric or lyric == "#":
            continue
        if end > start:
            segments.append({"start": start, "end": end, "text": lyric})
    return segments


def parse_ass_time(t_str):
    """Convert an ASS timestamp ``H:MM:SS.cc`` to seconds.

    Args:
        t_str: Timestamp string; surrounding whitespace is ignored.

    Returns:
        The time in seconds as a float.

    Raises:
        ValueError: If the string does not have exactly three numeric,
            colon-separated fields.
    """
    hours, minutes, seconds = (float(field) for field in t_str.strip().split(':'))
    return hours * 3600 + minutes * 60 + seconds


def format_ass_time(sec):
    """Format a time in seconds as an ASS timestamp ``H:MM:SS.cc``.

    The value is rounded to whole centiseconds *before* being split into
    fields, so the seconds field can never render as ``60.00`` (59.996 s
    becomes ``0:01:00.00``). Negative inputs are clamped to zero.

    Args:
        sec: Time offset in seconds.

    Returns:
        The formatted timestamp string.
    """
    total_cs = max(0, int(round(sec * 100)))
    total_secs, cs = divmod(total_cs, 100)
    total_mins, ns = divmod(total_secs, 60)
    nh, nm = divmod(total_mins, 60)
    return f"{nh:d}:{nm:02d}:{ns:02d}.{cs:02d}"


def get_audio_data_and_duration(file):
    """Load an audio file as mono audio at ``SAMPLE_RATE`` and measure its length.

    The target rate is passed straight to ``librosa.load`` so the audio is
    resampled once. Loading with librosa's default rate and resampling
    afterwards would resample it twice.

    Args:
        file: Path to the audio file.

    Returns:
        A ``(data, duration)`` tuple: the mono samples at ``SAMPLE_RATE`` and
        the duration in seconds.
    """
    data, _ = librosa.load(file, sr=SAMPLE_RATE, mono=True)
    duration = librosa.get_duration(y=data, sr=SAMPLE_RATE)
    return data, duration


def get_char_tokens(text, model):
    """Map each character of ``text`` to its index in the model's vocabulary.

    Characters missing from ``model.decoder.vocabulary`` are mapped to
    ``len(vocabulary)``. The lookup table is built once per call instead of
    scanning the vocabulary twice for every character.

    Args:
        text: The text to tokenize character by character.
        model: Object exposing ``decoder.vocabulary`` as a sequence of characters.

    Returns:
        A list of integer token ids, one per character of ``text``.
    """
    vocabulary = model.decoder.vocabulary
    unknown_id = len(vocabulary)

    char_to_id = {}
    for idx, symbol in enumerate(vocabulary):
        # setdefault keeps the first occurrence, matching list.index().
        char_to_id.setdefault(symbol, idx)

    return [char_to_id.get(character, unknown_id) for character in text]


def get_S_prime_and_T(text, model_name, model, audio_duration):
    """Compute the two quantities compared to decide whether text fits the audio.

    ``T`` is the number of output timesteps for ``audio_duration``, using a
    per-timestep duration chosen from the model name. ``S_prime`` is the
    number of tokens in ``text`` plus the number of positions where a token
    equals the one immediately before it.

    Args:
        text: Reference text.
        model_name: Model name; selects the timestep duration.
        model: Model exposing either ``tokenizer.text_to_ids`` or
            ``decoder.vocabulary``.
        audio_duration: Audio length in seconds.

    Returns:
        The tuple ``(S_prime, T)``.

    Raises:
        RuntimeError: If the model name is not recognised, or the model offers
            no way to obtain tokens.
    """
    # Checked in order; the first group with a marker in the name wins.
    timestep_by_markers = (
        (("citrinet", "_fastconformer_"), 0.08),
        (("_conformer_",), 0.04),
        (("quartznet",), 0.02),
    )
    for markers, timestep in timestep_by_markers:
        if any(marker in model_name for marker in markers):
            output_timestep_duration = timestep
            break
    else:
        raise RuntimeError("unexpected model name")

    T = int(audio_duration / output_timestep_duration) + 1

    if hasattr(model, 'tokenizer'):
        all_tokens = model.tokenizer.text_to_ids(text)
    elif hasattr(model.decoder, "vocabulary"):
        all_tokens = get_char_tokens(text, model)
    else:
        raise RuntimeError("cannot obtain tokens from this model")

    n_token_repetitions = sum(
        1 for earlier, later in zip(all_tokens, all_tokens[1:]) if later == earlier
    )
    S_prime = len(all_tokens) + n_token_repetitions

    return S_prime, T


def delete_mp4s_except_given_filepath(filepath):
    """Delete every ``.mp4`` file in the current working directory except one.

    Args:
        filepath: File name (as listed by ``os.listdir()``) that must be kept.
    """
    stale_videos = [
        entry for entry in os.listdir()
        if entry.endswith(".mp4") and entry != filepath
    ]
    for stale_video in stale_videos:
        os.remove(stale_video)


def _clamp_end_before_next_segment(end_time, segments, index):
    """Pull ``end_time`` back to 0.05 s before the next segment if it reaches into it."""
    if index < len(segments) - 1:
        next_start = segments[index + 1]['start']
        if end_time >= next_start:
            return next_start - 0.05
    return end_time


def _merge_chunk_ass_files(segments, ass_dir, utt_id):
    """Merge the per-chunk ASS files into one document on the full-audio timeline."""
    merged = []
    header_written = False

    for i, seg in enumerate(segments):
        chunk_ass_path = f"{ass_dir}/{utt_id}_{i:04d}.ass"
        if not os.path.exists(chunk_ass_path):
            continue

        chunk_lines = []
        with open(chunk_ass_path, "r", encoding='utf-8') as f:
            for line in f:
                if line.startswith("Dialogue:"):
                    parts = line.split(",", 9)
                    if len(parts) >= 10:
                        chunk_lines.append(parts)
                elif not header_written:
                    # Only the first chunk found contributes the ASS header.
                    merged.append(line)
        header_written = True

        last = len(chunk_lines) - 1
        for j, parts in enumerate(chunk_lines):
            # Chunk-local times are shifted by the chunk's offset in the audio.
            global_start = parse_ass_time(parts[1]) + seg['start']
            global_end = parse_ass_time(parts[2]) + seg['start']

            # The first and last lines are pinned to the segment's own bounds.
            if j == 0:
                global_start = seg['start']
            if j == last:
                global_end = _clamp_end_before_next_segment(seg['end'], segments, i)

            if global_start >= global_end:
                global_start = global_end - 0.01

            parts[1] = format_ass_time(global_start)
            parts[2] = format_ass_time(global_end)

            dialogue = ",".join(parts)
            if not dialogue.endswith("\n"):
                # A chunk file without a final newline must not glue its last
                # line onto the next chunk's first line.
                dialogue += "\n"
            merged.append(dialogue)

    return "".join(merged)


def _merge_chunk_ctm_files(segments, ctm_dir, ctm_type, utt_id):
    """Merge the per-chunk CTM files of one type into one full-audio CTM text."""
    merged = []

    for i, seg in enumerate(segments):
        chunk_ctm_path = f"{ctm_dir}/{utt_id}_{i:04d}.ctm"
        if not os.path.exists(chunk_ctm_path):
            continue

        with open(chunk_ctm_path, "r", encoding='utf-8') as f:
            rows = [parts for parts in (line.strip().split() for line in f) if len(parts) >= 5]

        last = len(rows) - 1
        for j, parts in enumerate(rows):
            parts[0] = str(utt_id)
            l_start = float(parts[2])
            l_dur = float(parts[3])
            l_end = l_start + l_dur

            g_start = l_start + seg['start']
            g_end = l_end + seg['start']

            if ctm_type == "segments":
                # Segment-level rows take the subtitle's own timing.
                g_start = seg['start']
                g_end = _clamp_end_before_next_segment(seg['end'], segments, i)
            else:
                if j == 0:
                    g_start = seg['start']
                if j == last:
                    g_end = _clamp_end_before_next_segment(seg['end'], segments, i)

            if g_start >= g_end:
                g_start = g_end - 0.01

            parts[2] = f"{g_start:.2f}"
            parts[3] = f"{(g_end - g_start):.2f}"
            merged.append(" ".join(parts) + "\n")

    return "".join(merged)


def align(Microphone, File_Upload, subs_file, text, split_on_newline, progress=gr.Progress()):
    """Force-align audio with text and produce a karaoke video plus subtitle files.

    If a subtitle file yields timed segments, the audio is cut into one chunk
    per segment, each chunk is aligned separately and the results are merged
    back onto the full-audio timeline. Otherwise the whole audio is aligned
    against ``text``, or against the ASR model's transcription when ``text``
    is empty.

    All result files except the video are written to fixed names in the
    current working directory.

    Args:
        Microphone: Path of the microphone recording, or ``None``.
        File_Upload: Path of the uploaded audio file, or ``None``.
        subs_file: Optional uploaded ``.srt`` / ``.lrc`` file: either an object
            with a ``name`` attribute holding its path, or the path itself.
        text: Optional reference text; ``|`` separates groups shown together.
        split_on_newline: Whether newlines in the text act as ``|`` separators.
        progress: Gradio progress tracker.

    Returns:
        An 11-tuple: the video path, an update for the info textbox, the video
        path again (for state), then updates carrying the paths of the ASS,
        word CTM, segment CTM, segment SRT, word-by-word SRT, additive SRT,
        LRC and ELRC files.

    Raises:
        gr.Error: On invalid input combinations, an unsupported subtitle
            extension, empty ASR output, or text too long for the audio.
    """
    utt_id = uuid.uuid4()
    output_video_filepath = f"{utt_id}.mp4"
    delete_mp4s_except_given_filepath(output_video_filepath)

    output_info = ""
    ass_text = ""

    progress(0, desc="Validating input")

    if (Microphone is not None) and (File_Upload is not None):
        raise gr.Error("Please use either the microphone or file upload input - not both")
    elif (Microphone is None) and (File_Upload is None):
        raise gr.Error("You have to either use the microphone or upload an audio file")
    elif Microphone is not None:
        file = Microphone
    else:
        file = File_Upload

    audio_data, duration = get_audio_data_and_duration(file)

    progress(0.1, desc="Loading speech recognition model")
    model_name = "ayymen/stt_zgh_fastconformer_ctc_small"
    model = ASRModel.from_pretrained(model_name)

    segments = []
    if subs_file is not None:
        # Accept both an object exposing `.name` and a plain path string.
        subs_path = getattr(subs_file, "name", subs_file)
        # utf-8-sig drops a leading byte-order mark; with plain utf-8 the BOM
        # stays glued to the first line and stops an LRC tag there from matching.
        with open(subs_path, 'r', encoding='utf-8-sig') as f:
            subs_content = f.read()

        if subs_path.lower().endswith('.srt'):
            segments = parse_srt(subs_content)
        elif subs_path.lower().endswith('.lrc'):
            segments = parse_lrc(subs_content, duration)
        else:
            raise gr.Error("Subtitle file must be an .srt or .lrc file.")

    with tempfile.TemporaryDirectory() as tmpdir:
        manifest_path = os.path.join(tmpdir, f"{utt_id}_manifest.json")
        nfa_output_dir = f"{tmpdir}/nfa_output"

        if segments:
            progress(0.2, desc="Chunking audio and generating manifest")
            with open(manifest_path, 'w', encoding='utf-8') as fout:
                for i, seg in enumerate(segments):
                    S_prime, T = get_S_prime_and_T(seg['text'], model_name, model, seg['end'] - seg['start'])
                    if S_prime > T:
                        raise gr.Error(f"Segment {i} ('{seg['text']}') has too much text for its audio duration ({seg['end'] - seg['start']:.2f}s).")

                    start_sample = int(seg['start'] * SAMPLE_RATE)
                    end_sample = int(seg['end'] * SAMPLE_RATE)
                    chunk_data = audio_data[start_sample:end_sample]

                    chunk_path = os.path.join(tmpdir, f"{utt_id}_{i:04d}.wav")
                    soundfile.write(chunk_path, chunk_data, SAMPLE_RATE)

                    seg_text = seg['text'].replace('\n', '|') if split_on_newline else seg['text'].replace('\n', ' ')

                    data = {
                        "audio_filepath": chunk_path,
                        "text": seg_text,
                    }
                    fout.write(f"{json.dumps(data)}\n")

            resegment_text_to_fill_space = False

        else:
            audio_path = os.path.join(tmpdir, f'{utt_id}.wav')
            soundfile.write(audio_path, audio_data, SAMPLE_RATE)

            if not text:
                progress(0.2, desc="Transcribing audio")
                text = model.transcribe([audio_path])[0]
                if 'hybrid' in model_name:
                    text = text[0]

                if text == "":
                    raise gr.Error("ERROR: the ASR model did not detect any speech. Please upload audio with speech.")

                output_info += (
                    "You did not enter any input text, so the ASR model's transcription will be used:\n"
                    "--------------------------\n"
                    f"{text}\n"
                    "--------------------------\n"
                    f"You could try pasting the transcription into the text input box, correcting any"
                    " transcription errors, and clicking 'Submit' again."
                )

            if split_on_newline:
                text = "|".join(list(filter(None, text.split("\n"))))

            S_prime, T = get_S_prime_and_T(text, model_name, model, duration)
            if S_prime > T:
                raise gr.Error("The number of tokens in the input text is too long compared to the duration of the audio.")

            with open(manifest_path, 'w', encoding='utf-8') as fout:
                data = {
                    "audio_filepath": audio_path,
                    "text": text,
                }
                fout.write(f"{json.dumps(data)}\n")

            resegment_text_to_fill_space = "|" not in text

        alignment_config = AlignmentConfig(
            pretrained_name=model_name,
            manifest_filepath=manifest_path,
            output_dir=f"{nfa_output_dir}/",
            audio_filepath_parts_in_utt_id=1,
            batch_size=1,
            use_local_attention=True,
            additional_segment_grouping_separator="|",
            save_output_file_formats=["ass", "ctm"],
            ass_file_config=ASSFileConfig(
                fontsize=45,
                resegment_text_to_fill_space=resegment_text_to_fill_space,
                max_lines_per_segment=4,
            ),
        )

        progress(0.5, desc="Aligning audio")
        main(alignment_config)
        progress(0.95, desc="Saving generated alignments")

        ass_path = "word_level.ass"
        word_ctm_path = "word_level.ctm"
        segment_ctm_path = "segment_level.ctm"

        if segments:
            ass_text = _merge_chunk_ass_files(segments, f"{nfa_output_dir}/ass/words", utt_id)
            with open(ass_path, "w", encoding="utf-8") as f:
                f.write(ass_text)

            for ctm_type, out_path in [("words", word_ctm_path), ("segments", segment_ctm_path)]:
                merged_ctm = _merge_chunk_ctm_files(
                    segments, f"{nfa_output_dir}/ctm/{ctm_type}", ctm_type, utt_id
                )
                with open(out_path, "w", encoding="utf-8") as f:
                    f.write(merged_ctm)

        else:
            ass_file_for_video = f"{nfa_output_dir}/ass/words/{utt_id}.ass"
            with open(ass_file_for_video, "r", encoding="utf-8") as f:
                ass_text = f.read()
            with open(ass_path, "w", encoding="utf-8") as f:
                f.write(ass_text)

            with open(f"{nfa_output_dir}/ctm/words/{utt_id}.ctm", "r", encoding="utf-8") as f:
                with open(word_ctm_path, "w", encoding="utf-8") as out_f:
                    out_f.write(f.read())

            with open(f"{nfa_output_dir}/ctm/segments/{utt_id}.ctm", "r", encoding="utf-8") as f:
                with open(segment_ctm_path, "w", encoding="utf-8") as out_f:
                    out_f.write(f.read())

        # Every other subtitle format is derived from the final ASS text.
        segments_for_subs = parse_ass_to_segments(ass_text)

        srt_seg_path = "segments.srt"
        srt_word_path = "word_by_word.srt"
        srt_add_path = "additive.srt"
        lrc_path = "segments.lrc"
        elrc_path = "word_level.elrc"

        subtitle_exports = (
            (srt_seg_path, generate_srt_segments),
            (srt_word_path, generate_srt_word_by_word),
            (srt_add_path, generate_srt_additive),
            (lrc_path, generate_lrc),
            (elrc_path, generate_elrc),
        )
        for export_path, generate in subtitle_exports:
            with open(export_path, "w", encoding="utf-8") as f:
                f.write(generate(segments_for_subs))

        full_audio_path = os.path.join(tmpdir, "full_audio.wav")
        soundfile.write(full_audio_path, audio_data, SAMPLE_RATE)

        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact.
        ffmpeg_command = [
            "ffmpeg", "-y", "-i", full_audio_path,
            "-f", "lavfi", "-i", "color=c=white:s=1280x720:r=50",
            "-crf", "1", "-shortest", "-vcodec", "libx264", "-pix_fmt", "yuv420p",
            "-vf", f"ass='{ass_path}'",
            output_video_filepath,
        ]
        try:
            ffmpeg_status = subprocess.run(ffmpeg_command, check=False).returncode
        except OSError as err:
            # ffmpeg could not be started at all (e.g. it is not installed).
            logging.warning(f"Could not run ffmpeg to render the output video: {err}")
        else:
            if ffmpeg_status != 0:
                logging.warning(f"ffmpeg exited with status {ffmpeg_status}; the output video may be missing.")

    return (
        output_video_filepath,
        gr.update(value=output_info, visible=bool(output_info)),
        output_video_filepath,
        gr.update(value=ass_path, visible=True),
        gr.update(value=word_ctm_path, visible=True),
        gr.update(value=segment_ctm_path, visible=True),
        gr.update(value=srt_seg_path, visible=True),
        gr.update(value=srt_word_path, visible=True),
        gr.update(value=srt_add_path, visible=True),
        gr.update(value=lrc_path, visible=True),
        gr.update(value=elrc_path, visible=True),
    )


def delete_non_tmp_video(video_path):
    """Remove the given video file if a path is given and the file exists.

    Args:
        video_path: Path of the video to delete; any falsy value is ignored.

    Returns:
        Always ``None``.
    """
    if not video_path:
        return None
    if os.path.exists(video_path):
        os.remove(video_path)
    return None


# Gradio UI: inputs in the left column, video and downloadable files in the
# right column, example inputs at the bottom.
with gr.Blocks(title="NeMo Forced Aligner", theme="huggingface") as demo:
    # Holds the path of the generated video so it can be deleted after display.
    non_tmp_output_video_filepath = gr.State([])

    # Page header.
    with gr.Row():
        with gr.Column():
            gr.Markdown("# NeMo Forced Aligner")
            gr.Markdown(
                "Demo for [NeMo Forced Aligner](https://github.com/NVIDIA/NeMo/tree/main/tools/nemo_forced_aligner) (NFA). "
                "Upload audio in Tamazight and (optionally) the text spoken in the audio to generate a video where each part of the text will be highlighted as it is spoken. "
                "**Now supports syncing with pre-timed SRT or LRC files and generates all subtitle formats instantly!**",
            )

    with gr.Row():
        # Left column: inputs.
        with gr.Column(scale=1):
            gr.Markdown("## Input")
            mic_in = gr.Audio(sources=["microphone"], type='filepath', label="Microphone input")
            audio_file_in = gr.Audio(sources=["upload"], type='filepath', label="File upload")

            subs_file_in = gr.File(
                label="[Optional] Upload an SRT or LRC file to constrain alignment to predefined timestamps",
                file_types=[".srt", ".lrc"],
            )

            ref_text = gr.Textbox(
                label="[Optional] The reference text. Use '|' separators to specify which text will appear together. "
                "Leave this field blank to use an ASR model's transcription as the reference text instead. (Ignored if SRT/LRC is uploaded)"
            )
            split_on_newline = gr.Checkbox(
                True,
                label="Separate text on new lines",
            )

            submit_button = gr.Button("Submit")

        # Right column: outputs. The file widgets stay hidden until a run fills them.
        with gr.Column(scale=1):
            gr.Markdown("## Output Video")
            video_out = gr.Video(label="Output Video")
            text_out = gr.Textbox(label="Output Info", visible=False)

            gr.Markdown("## Download Subtitle Files")
            with gr.Row():
                ass_file = gr.File(label="ASS (Karaoke)", visible=False)
                word_ctm_file = gr.File(label="CTM (Word-level)", visible=False)
                segment_ctm_file = gr.File(label="CTM (Segment-level)", visible=False)

            with gr.Row():
                srt_seg_file = gr.File(label="SRT (Segments)", visible=False)
                srt_word_file = gr.File(label="SRT (Word-by-Word)", visible=False)
                srt_add_file = gr.File(label="SRT (Additive)", visible=False)

            with gr.Row():
                lrc_file = gr.File(label="LRC (Segments)", visible=False)
                elrc_file = gr.File(label="ELRC (Word-level)", visible=False)

    # Footer links.
    with gr.Row():
        gr.HTML(
            "<p style='text-align: center'>"
            "Tutorial: <a href='https://colab.research.google.com/github/NVIDIA/NeMo/blob/main/tutorials/tools/NeMo_Forced_Aligner_Tutorial.ipynb' target='_blank'>\"How to use NFA?\"</a> 🚀 | "
            "Blog post: <a href='https://nvidia.github.io/NeMo/blogs/2023/2023-08-forced-alignment/' target='_blank'>\"How does forced alignment work?\"</a> 📚 | "
            "NFA <a href='https://github.com/NVIDIA/NeMo/tree/main/tools/nemo_forced_aligner/' target='_blank'>Github page</a> 👩💻"
            "</p>"
        )

    # Run the alignment, then delete the generated video file named in the state.
    align_inputs = [mic_in, audio_file_in, subs_file_in, ref_text, split_on_newline]
    align_outputs = [
        video_out, text_out, non_tmp_output_video_filepath,
        ass_file, word_ctm_file, segment_ctm_file,
        srt_seg_file, srt_word_file, srt_add_file, lrc_file, elrc_file,
    ]
    alignment_event = submit_button.click(fn=align, inputs=align_inputs, outputs=align_outputs)
    alignment_event.then(
        fn=delete_non_tmp_video, inputs=[non_tmp_output_video_filepath], outputs=None,
    )

    example_2 = """ⵜⴰⴽⵟⵟⵓⵎⵜ ⵏ ⵜⵙⴰⴷⵓⴼⵜ.
ⵙ ⵉⵙⵎ ⵏ ⵕⴱⴱⵉ ⴰⵎⴰⵍⵍⴰⵢ ⴰⵎⵙⵎⵓⵍⵍⵓ.
ⴰⵎⵓⵢ ⵉ ⵕⴱⴱⵉ ⵍⵍⵉ ⵎⵓ ⵜⴳⴰ ⵜⵓⵍⵖⵉⵜ ⵜⵉⵏⵏⵙ, ⵕⴱⴱⵉ ⵏ ⵉⵖⵥⵡⴰⵕⵏ, ⴽⵔⴰ ⴳⴰⵏ.
ⴰⵎⴰⵍⵍⴰⵢ ⴰⵎⵙⵎⵓⵍⵍⵓ, ⵖ ⵜⵎⵣⵡⴰⵔⵓⵜ ⵓⵍⴰ ⵖ ⵜⵎⴳⴳⴰⵔⵓⵜ.
ⴰⴳⵍⵍⵉⴷ ⵏ ⵡⴰⵙⵙ ⵏ ⵓⴼⵔⴰ, ⴰⵙⵙ ⵏ ⵓⵙⵙⵃⵙⵓ, ⴽⵔⴰⵉⴳⴰⵜ ⵢⴰⵏ ⴷ ⵎⴰⴷ ⵉⵙⴽⵔ.
ⵀⴰ ⵏⵏ ⴽⵢⵢⵉ ⴽⴰ ⵙ ⵏⵙⵙⵓⵎⴷ, ⴷ ⴽⵢⵢⵉ ⴽⴰ ⴰⴷ ⵏⵎⵎⵜⵔ.
ⵙⵎⵓⵏ ⴰⵖ, ⵜⵎⵍⵜ ⴰⵖ, ⴰⵖⴰⵔⴰⵙ ⵢⵓⵖⴷⵏ.
ⴰⵖⴰⵔⴰⵙ ⵏ ⵖⵡⵉⵍⵍⵉ ⵜⵙⵏⵏⵓⴼⴰⵜ, ⵓⵔ ⴷ ⴰⵢⵜ ⵜⵉⵢⵓⵔⵉ, ⵓⵍⴰ ⵉⵎⵓⴹⴹⴰⵕ."""
    example_3 = "ⴷⴰⴳ ⵓⵢⵍⵅ ⵙ ⵉⴳⵏⵏⴰ|ⵏⵏⴰⵏ ⵉⵢⵉ|ⴳⴳⵯⵣ ⴷ!|ⵏⵏⵉⵅ ⴰⵙⵏ|ⵜⵎⵢⴰⵔⵎ ⴰⵣⴷⴷⵉⵔ|ⵜⵎⵢⴰⵔⵎ ⵜⴰⵍⵍⴰ ⴷ ⵉⵎⵟⵟⴰⵡⵏ"
    # Each example row is: audio file, subtitle file, reference text.
    examples = gr.Examples(
        examples=[
            ["common_voice_zgh_37837257.mp3", None, "ⵎⵍ ⵉⵢⵉ ⵎⴰⴷ ⴷ ⵜⴻⵜⵜⵎⵓⵏⴷ ⴰⴷ ⴰⴽ ⵎⵍⵖ ⵎⴰⴷ ⵜⴳⵉⴷ"],
            ["Voice1410.wav", None, example_2],
            ["Tamazight_For_All.mp3", None, example_3],
        ],
        inputs=[audio_file_in, subs_file_in, ref_text],
    )

demo.queue()
demo.launch()