import json
import os
import re
import subprocess
import time
from pathlib import Path

import gradio as gr
import pandas as pd
import psutil
import requests
import torch
import whisper
from pytube import YouTube
|
|
|
|
| |
|
|
|
|
# Expose the CPU count reported by psutil to OpenMP-backed libraries.
num_cores = psutil.cpu_count()
os.environ["OMP_NUM_THREADS"] = str(num_cores)

# Header attached to every DeepL request. The value is taken verbatim from the
# DeepL_API_KEY environment variable; a missing variable raises KeyError here.
# NOTE(review): DeepL documents the scheme "DeepL-Auth-Key <key>" - confirm the
# environment variable already carries that prefix.
headers = {"Authorization": os.environ["DeepL_API_KEY"]}

# Inference is pinned to the CPU.
device = "cpu"
print("DEVICE IS: ")
print(device)

# Both Whisper checkpoints are loaded eagerly when the module is imported.
asr_model_base = whisper.load_model("base", device=device)
asr_model_small = whisper.load_model("small", device=device)
whisper_models_dict = {"base": asr_model_base, "small": asr_model_small}

# Choices offered in the "Selected Whisper model" dropdown.
whisper_models = list(whisper_models_dict)

# Decoding options forwarded to ``model.transcribe``.
transcribe_options = {"beam_size": 3, "best_of": 3, "without_timestamps": False}
|
|
# Display name -> language code for the "Spoken language in video" dropdown.
# Only the keys are read in this file (they become the dropdown choices).
# The final entry is a sentinel meaning "do not force a language".
# NOTE(review): "ns" for Northern Sotho is not an ISO 639-1 code; it is kept
# as found because some model vocabularies use it - confirm against the
# consumer of these codes.
source_languages = {
    "Afrikaans": "af",
    "Amharic": "am",
    "Arabic": "ar",
    "Asturian": "ast",
    "Azerbaijani": "az",
    "Bashkir": "ba",
    "Belarusian": "be",
    "Bulgarian": "bg",
    "Bengali": "bn",
    "Breton": "br",
    "Bosnian": "bs",
    "Catalan; Valencian": "ca",
    "Cebuano": "ceb",
    "Czech": "cs",
    "Welsh": "cy",
    "Danish": "da",
    "German": "de",
    "Greek": "el",
    "English": "en",
    "Spanish": "es",
    "Estonian": "et",
    "Persian": "fa",
    "Fulah": "ff",
    "Finnish": "fi",
    "French": "fr",
    "Western Frisian": "fy",
    "Irish": "ga",
    "Gaelic; Scottish Gaelic": "gd",
    "Galician": "gl",
    "Gujarati": "gu",
    "Hausa": "ha",
    "Hebrew": "he",
    "Hindi": "hi",
    "Croatian": "hr",
    "Haitian; Haitian Creole": "ht",
    "Hungarian": "hu",
    "Armenian": "hy",
    "Indonesian": "id",
    "Igbo": "ig",
    "Iloko": "ilo",
    "Icelandic": "is",
    "Italian": "it",
    "Japanese": "ja",
    "Javanese": "jv",
    "Georgian": "ka",
    "Kazakh": "kk",
    "Central Khmer": "km",
    "Kannada": "kn",
    "Korean": "ko",
    "Luxembourgish; Letzeburgesch": "lb",
    "Ganda": "lg",
    "Lingala": "ln",
    "Lao": "lo",
    "Lithuanian": "lt",
    "Latvian": "lv",
    "Malagasy": "mg",
    "Macedonian": "mk",
    "Malayalam": "ml",
    "Mongolian": "mn",
    "Marathi": "mr",
    "Malay": "ms",
    "Burmese": "my",
    "Nepali": "ne",
    "Dutch; Flemish": "nl",
    "Norwegian": "no",
    "Northern Sotho": "ns",
    "Occitan (post 1500)": "oc",
    "Oriya": "or",
    "Panjabi; Punjabi": "pa",
    "Polish": "pl",
    "Pushto; Pashto": "ps",
    "Portuguese": "pt",
    "Romanian; Moldavian; Moldovan": "ro",
    "Russian": "ru",
    "Sindhi": "sd",
    "Sinhala; Sinhalese": "si",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Somali": "so",
    "Albanian": "sq",
    "Serbian": "sr",
    "Swati": "ss",
    "Sundanese": "su",
    "Swedish": "sv",
    "Swahili": "sw",
    "Tamil": "ta",
    "Thai": "th",
    "Tagalog": "tl",
    "Tswana": "tn",
    "Turkish": "tr",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Wolof": "wo",
    "Xhosa": "xh",
    "Yiddish": "yi",
    "Yoruba": "yo",
    "Chinese": "zh",
    "Zulu": "zu",
    "Let the model analyze": "Let the model analyze",
}
|
|
# Display name -> DeepL ``target_lang`` code. The keys populate the target
# language dropdown, so insertion order is the order shown to the user.
DeepL_language_codes_for_translation = dict(
    Bulgarian="BG",
    Czech="CS",
    Danish="DA",
    German="DE",
    Greek="EL",
    English="EN",
    Spanish="ES",
    Estonian="ET",
    Finnish="FI",
    French="FR",
    Hungarian="HU",
    Indonesian="ID",
    Italian="IT",
    Japanese="JA",
    Lithuanian="LT",
    Latvian="LV",
    Dutch="NL",
    Polish="PL",
    Portuguese="PT",
    Romanian="RO",
    Russian="RU",
    Slovak="SK",
    Slovenian="SL",
    Swedish="SV",
    Turkish="TR",
    Ukrainian="UK",
    Chinese="ZH",
)
|
|
|
|
# Dropdown choices: the display names of each table, in insertion order.
source_language_list = list(source_languages)
translation_models_list = list(DeepL_language_codes_for_translation)

# Make sure the ./videos_out directory exists (no error if it already does).
videos_out_path = Path("./videos_out")
videos_out_path.mkdir(parents=True, exist_ok=True)
|
|
def get_youtube(video_url):
    """Download a YouTube video as a progressive MP4 and return its path.

    Among the progressive MP4 streams of the video, the one with the highest
    resolution is downloaded using pytube's default output location.

    Args:
        video_url: URL of the YouTube video to download.

    Returns:
        The path of the downloaded file, as returned by pytube.

    Raises:
        ValueError: If ``video_url`` is empty/blank, or the video offers no
            progressive MP4 stream.
    """
    if not video_url or not video_url.strip():
        raise ValueError("Error no video url input")

    yt = YouTube(video_url)
    stream = (
        yt.streams.filter(progressive=True, file_extension='mp4')
        .order_by('resolution')
        .desc()
        .first()
    )
    # ``first()`` yields None when the filter matched nothing; fail with a
    # clear message instead of an AttributeError on ``None.download``.
    if stream is None:
        raise ValueError("Error no progressive mp4 stream available for this video")

    abs_video_path = stream.download()
    # Log message is Finnish for "downloaded to path".
    print("LADATATTU POLKUUN")
    print(abs_video_path)

    return abs_video_path
|
|
def _segments_to_dataframe(segments):
    """Build the start/end/text table from an iterable of Whisper segments."""
    rows = [
        {'start': segment['start'], 'end': segment['end'], 'text': segment['text']}
        for segment in segments
    ]
    # Passing ``columns`` keeps the three headers even when there are no rows.
    return pd.DataFrame(rows, columns=['start', 'end', 'text'])


def speech_to_text(video_file_path, selected_translation_lang, whisper_model):
    """Transcribe the audio track of a video with a locally loaded Whisper model.

    Args:
        video_file_path: Path of the video file to transcribe.
        selected_translation_lang: Value of the "Spoken language in video"
            dropdown. It is accepted for interface compatibility but is
            currently not used: no language is passed to ``transcribe``.
        whisper_model: Key into ``whisper_models_dict`` selecting the model.

    Returns:
        A DataFrame with one row per transcribed segment and the columns
        ``start``, ``end`` and ``text``.

    Raises:
        ValueError: If ``video_file_path`` is None.
        RuntimeError: If the audio cannot be loaded, or if model lookup or
            transcription fails; the original exception is chained as the
            cause.
    """
    if video_file_path is None:
        raise ValueError("Error no video input")
    print(video_file_path)

    try:
        audio = whisper.load_audio(video_file_path)
    except Exception as e:
        raise RuntimeError("Error converting video to audio") from e

    try:
        print('Transcribing via local model')
        # Indexing (not .get) so an unknown model name surfaces as a KeyError
        # that is wrapped below, rather than an AttributeError on None.
        model = whisper_models_dict[whisper_model]
        transcription = model.transcribe(audio, **transcribe_options)
        return _segments_to_dataframe(transcription['segments'])
    except Exception as e:
        raise RuntimeError("Error Running inference with local model", e) from e
|
|
|
|
|
|
def translate_transcriptions(df, selected_translation_lang_2):
    """Translate the ``text`` column of a transcription table with DeepL.

    All rows are joined with newlines and sent in a single request; the
    translated text is split on newlines again and stored row by row. If the
    request fails, the response cannot be parsed, or the number of returned
    lines differs from the number of rows, the original text is copied into
    the ``translation`` column instead (best-effort fallback).

    Args:
        df: DataFrame with a ``text`` column, one row per segment. It is not
            modified; a re-indexed copy is returned.
        selected_translation_lang_2: Display name of the target language (a
            key of ``DeepL_language_codes_for_translation``). None selects
            'English'.

    Returns:
        A new DataFrame holding the former index as an ``index`` column, the
        input columns, and a ``translation`` column.
    """
    if selected_translation_lang_2 is None:
        selected_translation_lang_2 = 'English'
    # reset_index() returns a copy, so the caller's frame is left untouched.
    df = df.reset_index()

    print("start_translation")

    # Nothing to translate: skip the network round trip entirely.
    if df.empty:
        df['translation'] = df['text']
        print("translations done")
        return df

    text_combined = '\n'.join(str(sentence) for sentence in df['text'])

    data = {
        'text': text_combined,
        # Split on punctuation and newlines; the newline round trip below
        # relies on line breaks surviving translation.
        'split_sentences': '1',
        'target_lang': DeepL_language_codes_for_translation.get(selected_translation_lang_2),
    }
    try:
        response = requests.post(
            'https://api-free.deepl.com/v2/translate',
            headers=headers,
            data=data,
            timeout=60,
        )
        response.raise_for_status()
        translated_sentences = response.json()['translations'][0]['text'].split('\n')
        if len(translated_sentences) != len(df):
            raise ValueError(
                f"DeepL returned {len(translated_sentences)} lines for {len(df)} rows"
            )
        df['translation'] = translated_sentences
    except Exception as e:
        # Deliberate best-effort: log the problem and show the untranslated text.
        print(e)
        df['translation'] = df['text']

    print("translations done")

    return df
|
|
def _format_srt_timestamp(seconds):
    """Format an offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``)."""
    remaining_ms = round(float(seconds) * 1000.0)
    hours, remaining_ms = divmod(remaining_ms, 3_600_000)
    minutes, remaining_ms = divmod(remaining_ms, 60_000)
    whole_seconds, millis = divmod(remaining_ms, 1_000)
    return f"{hours:02d}:{minutes:02d}:{whole_seconds:02d},{millis:03d}"


def create_srt_and_burn(df, video_in):
    """Write the translations to ``testi.srt`` and burn them into the video.

    Args:
        df: DataFrame with ``start`` and ``end`` (seconds) and ``translation``
            columns, one row per subtitle cue.
        video_in: Path of the source video file.

    Returns:
        The output path: ``video_in`` with its extension replaced by
        ``_out.mp4``. The path is returned even when ffmpeg is missing or
        exits with an error; in that case the error is printed.

    Raises:
        ValueError: If ``video_in`` is None.
    """
    if video_in is None:
        raise ValueError("Error no video input")

    print("Starting creation of video wit srt")

    # One cue per row: running number, time range, subtitle text.
    cues = [
        f"{number}\n"
        f"{_format_srt_timestamp(start)} --> {_format_srt_timestamp(end)}\n"
        f"{text}"
        for number, (start, end, text) in enumerate(
            zip(df['start'], df['end'], df['translation']), start=1
        )
    ]
    srt_content = '\n\n'.join(cues)
    with open('testi.srt', 'w', encoding="utf-8") as file:
        file.write(srt_content)

    print("SRT DONE")
    print(srt_content)
    print(video_in)

    # Derive the output name from the path stem so it can never equal the
    # input path (ffmpeg -y would otherwise overwrite the file it is reading).
    stem, _ = os.path.splitext(video_in)
    video_out = f"{stem}_out.mp4"
    print(video_out)

    # Argument list instead of a shell string: the input path may contain
    # quotes or shell metacharacters (e.g. from a video title).
    command = ['ffmpeg', '-i', video_in, '-y', '-vf', 'subtitles=./testi.srt', video_out]
    print(command)
    try:
        subprocess.run(command, check=True)
    except (OSError, subprocess.CalledProcessError) as e:
        # Best-effort, as before: report the failure and still return the path.
        print(e)
    return video_out
|
|
|
|
| |
# UI components are created up front and placed into the layout later via
# their ``render()`` method.
video_in = gr.Video(label="Video file", mirror_webcam=False)
youtube_url_in = gr.Textbox(label="Youtube url", lines=1, interactive=True)
video_out = gr.Video(label="Video Out", mirror_webcam=False)

# Empty table used as the initial value of both dataframe components.
df_init = pd.DataFrame(columns=['start', 'end', 'text', 'translation'])

selected_source_lang = gr.Dropdown(
    choices=source_language_list,
    type="value",
    value="Let the model analyze",
    label="Spoken language in video",
    interactive=True,
)
selected_translation_lang_2 = gr.Dropdown(
    choices=translation_models_list,
    type="value",
    value="English",
    label="In which language you want the transcriptions?",
    interactive=True,
)
selected_whisper_model = gr.Dropdown(
    choices=whisper_models,
    type="value",
    value="base",
    label="Selected Whisper model",
    interactive=True,
)

transcription_df = gr.DataFrame(
    value=df_init,
    label="Transcription dataframe",
    row_count=(0, "dynamic"),
    max_rows=10,
    wrap=True,
    overflow_row_behaviour='paginate',
)
transcription_and_translation_df = gr.DataFrame(
    value=df_init,
    label="Transcription and translation dataframe",
    max_rows=10,
    wrap=True,
    overflow_row_behaviour='paginate',
)
|
|
# Page layout: an intro row, then one row per processing step. Each button is
# wired to the function that implements its step.
demo = gr.Blocks(css='''
#cut_btn, #reset_btn { align-self:stretch; }
#\\31 3 { max-width: 540px; }
.output-markdown {max-width: 65ch !important;}
''')
demo.encrypt = False
with demo:
    transcription_var = gr.Variable()

    # Intro: what the space does (left) and sample URLs (right).
    with gr.Row():
        with gr.Column():
            gr.Markdown('''
            ### This space allows you to:
            ##### 1. Download youtube video with a given URL
            ##### 2. Watch it in the first video component
            ##### 3. Run automatic speech recognition on the video using Whisper
            ##### 4. Translate the recognized transcriptions to 26 languages supported by deepL
            ##### 5. Burn the translations to the original video and watch the video in the 2nd video component
            ''')

        with gr.Column():
            gr.Markdown('''
            ### 1. Insert Youtube URL below. Some test videos below:
            ##### 1. https://www.youtube.com/watch?v=nlMuHtV82q8&ab_channel=NothingforSale24
            ##### 2. https://www.youtube.com/watch?v=JzPfMbG1vrE&ab_channel=ExplainerVideosByLauren
            ##### 3. https://www.youtube.com/watch?v=S68vvV0kod8&ab_channel=Pearl-CohnTelevision
            ''')

    # Step 1: download the video behind the URL into the input video player.
    with gr.Row(), gr.Column():
        youtube_url_in.render()
        download_youtube_btn = gr.Button("Step 1. Download Youtube video")
        download_youtube_btn.click(
            get_youtube,
            [youtube_url_in],
            [video_in],
        )
        print(video_in)

    # Step 2: input video (left) and transcription controls (right).
    with gr.Row():
        with gr.Column():
            video_in.render()
        with gr.Column():
            gr.Markdown('''
            ##### Here you can start the transcription and translation process.
            ##### Be aware that processing will last some time. With base model it is around 3x speed
            ''')
            selected_source_lang.render()
            selected_whisper_model.render()
            transcribe_btn = gr.Button("Step 2. Transcribe audio")
            transcribe_btn.click(
                speech_to_text,
                [video_in, selected_source_lang, selected_whisper_model],
                transcription_df,
            )

    with gr.Row():
        gr.Markdown('''
        ##### Here you will get transcription output
        ##### ''')

    with gr.Row(), gr.Column():
        transcription_df.render()

    # Step 3: translate the transcription table.
    with gr.Row(), gr.Column():
        gr.Markdown('''
        ##### Here you will get translated transcriptions.
        ##### Please remember to select target language
        ##### ''')
        selected_translation_lang_2.render()
        translate_transcriptions_button = gr.Button("Step 3. Translate transcription")
        translate_transcriptions_button.click(
            translate_transcriptions,
            [transcription_df, selected_translation_lang_2],
            transcription_and_translation_df,
        )
        transcription_and_translation_df.render()

    # Step 4: write the subtitles and burn them into the video.
    with gr.Row(), gr.Column():
        gr.Markdown('''
        ##### Now press the Step 4. Button to create output video with translated transcriptions
        ##### ''')
        translate_and_make_srt_btn = gr.Button("Step 4. Create and burn srt to video")
        print(video_in)
        translate_and_make_srt_btn.click(
            create_srt_and_burn,
            [transcription_and_translation_df, video_in],
            [video_out],
        )
        video_out.render()

demo.launch()