| |
| """app |
| |
| Automatically generated by Colab. |
| |
| Original file is located at |
| https://colab.research.google.com/drive/1hDCBaCrOX0FZx8VUT9_cUfWWg7y97yrx |
| """ |
|
|
| import gradio as gr |
| import os |
| import tempfile |
| import whisper |
| import re |
| from groq import Groq |
| from gtts import gTTS |
|
|
| |
# Speech-to-text: load the Whisper "base" checkpoint once at import time so
# the transcription helper can reuse the same module-level model.
whisper_model = whisper.load_model("base")

# Groq chat-completions client. The key is read from the GROQ_API_KEY
# environment variable (None is passed through when it is unset).
_groq_api_key = os.environ.get("GROQ_API_KEY")
groq_client = Groq(api_key=_groq_api_key)
|
|
| |
# Display name -> two-letter language code handed to the speech synthesizer.
# Insertion order is the order in which the names are offered in the UI.
LANGUAGE_CODES = {
    "English": "en",
    "Chinese": "zh",
    "Thai": "th",
    "Malay": "ms",
    "Korean": "ko",
    "Japanese": "ja",
    "Spanish": "es",
    "German": "de",
    "Hindi": "hi",
    "French": "fr",
    "Russian": "ru",
    "Tagalog": "tl",
    "Arabic": "ar",
    "Myanmar": "my",
    "Vietnamese": "vi",
}

# Language names selectable as input/output; same names and order as the
# keys of LANGUAGE_CODES.
SUPPORTED_LANGUAGES = list(LANGUAGE_CODES)
|
|
def transcribe_audio_locally(audio):
    """Run the module-level Whisper model on *audio* and return its text.

    *audio* is either a value passed straight to ``whisper_model.transcribe``
    or a dict carrying that value under the ``"name"`` key. ``None`` yields
    an empty string. Failures are printed and reported back to the caller as
    an ``"Error transcribing audio: ..."`` string rather than raised.
    """
    if audio is None:
        return ""

    try:
        if isinstance(audio, dict) and "name" in audio:
            source = audio["name"]
        else:
            source = audio
        transcription = whisper_model.transcribe(source)
        return transcription["text"]
    except Exception as exc:
        print(f"Error transcribing audio locally: {exc}")
        return f"Error transcribing audio: {exc!s}"
|
|
# Compiled once at import time instead of on every call.
_THINK_BLOCK_RE = re.compile(r'<think>.*?</think>', re.DOTALL)
_REASONING_LINE_RE = re.compile(
    r"^\s*(?:Let me think|I need to|First,|Okay, so|Hmm,|Let's break this down).*$",
    re.MULTILINE,
)


def translate_text(input_text, input_lang, output_langs):
    """Translate *input_text* into every language in *output_langs* via Groq.

    Args:
        input_text: Text to translate.
        input_lang: Name of the language *input_text* is written in.
        output_langs: Names of the target languages.

    Returns:
        The model's reply as a string, with ``<think>...</think>`` blocks and
        common "thinking out loud" lines removed. The reply is requested as
        one ``Language: translation`` line per target language. An empty
        string is returned when there is nothing to translate, and an
        ``"Error: ..."`` string when the API call fails.
    """
    # Always return a string: the previous ``[]`` for empty input was the only
    # non-string return path.
    if not input_text or not output_langs:
        return ""

    # The system prompt must agree with the user prompt (and with the
    # ``Language: text`` parsing done on the reply). It previously forbade
    # language prefixes while the user prompt demanded them.
    system_prompt = (
        "You are a translation assistant that provides direct, accurate translations.\n"
        "Do NOT include any thinking, reasoning, or explanations in your response.\n"
        "Respond with exactly one line per requested language, formatted as "
        "'<Language name>: <translation>', and nothing else."
    )
    user_prompt = f"Translate this {input_lang} text: '{input_text}' into the following languages: {', '.join(output_langs)}. Provide each translation on a separate line with the language name as a prefix."

    try:
        response = groq_client.chat.completions.create(
            model="meta-llama/llama-4-maverick-17b-128e-instruct",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        # ``content`` may be None; treat that as an empty reply rather than
        # letting ``.strip()`` raise and surface as a bogus "Error:" string.
        raw_reply = response.choices[0].message.content or ""
    except Exception as e:
        print(f"Error translating text: {e}")
        return f"Error: {str(e)}"

    without_think = _THINK_BLOCK_RE.sub('', raw_reply)
    without_reasoning = _REASONING_LINE_RE.sub('', without_think)
    # Strip last so whitespace left behind by the removals is dropped too.
    return without_reasoning.strip()
|
|
def synthesize_speech(text, lang):
    """Synthesize *text* to an MP3 file with gTTS and return the file path.

    Args:
        text: Text to speak. Empty/``None`` produces no audio.
        lang: Language display name; looked up in ``LANGUAGE_CODES`` and
            falling back to ``"en"`` when the name is unknown.

    Returns:
        Path of the generated ``.mp3`` temporary file, or ``None`` when
        *text* is empty or synthesis fails. The file is created with
        ``delete=False`` so it outlives this call; the caller owns it.
    """
    if not text:
        return None

    output_path = None
    try:
        lang_code = LANGUAGE_CODES.get(lang, "en")
        tts = gTTS(text=text, lang=lang_code)

        # Reserve a unique file name, then close the handle *before* gTTS
        # writes to it: reopening a NamedTemporaryFile by name while it is
        # still open is not portable (it fails on Windows).
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
            output_path = fp.name

        tts.save(output_path)
        return output_path
    except Exception as e:
        print(f"Error synthesizing speech: {e}")
        # Do not leave an empty/partial temp file behind on failure.
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass  # best-effort cleanup; the synthesis error was already reported
        return None
|
|
def clear_memory():
    """Return reset values for the seven output fields.

    Four empty strings (transcription plus three translations) followed by
    three ``None`` values (the three audio outputs), as a tuple.
    """
    blank_texts = ("",) * 4
    blank_audio = (None,) * 3
    return blank_texts + blank_audio
|
|
def process_speech_to_text(audio):
    """Transcribe *audio* and return the text; falsy input yields ``""``."""
    return transcribe_audio_locally(audio) if audio else ""
|
|
def clean_translation_output(text):
    """Strip model "thinking" residue from *text*, keeping translation lines.

    ``<think>...</think>`` blocks are removed first. The remainder is then
    filtered line by line:

    * lines that open with a reasoning phrase ("I need to", "Let me", ...)
      are dropped;
    * lines containing a colon and the name of a supported language are kept;
    * any other line is kept only if it is non-blank and does not mention
      thinking / translating / understand / process.

    Returns the surviving lines joined with newlines (``""`` for empty input).
    """
    if not text:
        return ""

    without_tags = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)

    def _keep(candidate):
        # Reasoning openers are rejected before anything else is considered.
        if re.search(r'(^I need to|^Let me|^First|^Okay|^Hmm|^I will|^I am thinking|^I should)', candidate, re.IGNORECASE):
            return False

        # "<Language>: ..." style lines are always retained.
        if ':' in candidate:
            lowered = candidate.lower()
            if any(name.lower() in lowered for name in SUPPORTED_LANGUAGES):
                return True

        if not candidate.strip():
            return False
        mentions_process = re.search(r'(thinking|translating|understand|process)', candidate, re.IGNORECASE)
        return mentions_process is None

    return '\n'.join(candidate for candidate in without_tags.split('\n') if _keep(candidate))
|
|
def extract_translations(translations_text, output_langs):
    """Pull up to three translations out of the model's reply.

    Args:
        translations_text: Raw reply text, expected to contain
            ``Language: translation`` lines.
        output_langs: Requested language names, in display order.

    Returns:
        A list of exactly three strings. When at least one language label is
        found, slot *i* holds the translation for ``output_langs[i]`` (``""``
        if that particular label is missing), so every translation stays
        paired with its own language. When no label is found at all, the
        cleaned lines are used in order (text after the first colon, if any).
        Unused slots are ``""``.
    """
    if not translations_text or not output_langs:
        return [""] * 3

    clean_text = clean_translation_output(translations_text)

    # Labelled extraction. A placeholder is stored for every miss: skipping
    # misses (as before) shifted later translations into earlier slots and
    # paired them with the wrong language downstream.
    labelled = []
    found_any = False
    for lang in output_langs:
        # re.escape keeps the language name literal inside the pattern. The
        # capture stops at a newline followed by an ASCII letter or at the
        # end of the text.
        pattern = rf'{re.escape(lang)}\s*:\s*(.*?)(?=\n\s*[A-Z]|$)'
        match = re.search(pattern, clean_text, re.IGNORECASE | re.DOTALL)
        if match:
            found_any = True
            labelled.append(match.group(1).strip())
        else:
            labelled.append("")

    if found_any:
        translation_results = labelled
    elif '\n' in clean_text:
        # No recognizable labels: take the non-blank lines in order and drop
        # whatever prefix precedes the first colon.
        translation_results = []
        for line in (raw.strip() for raw in clean_text.split('\n')):
            if not line:
                continue
            _, colon, remainder = line.partition(':')
            translation_results.append(remainder.strip() if colon else line)
    else:
        # Single unlabelled line: use it as the only translation.
        translation_results = [clean_text]

    # Always hand back exactly three slots.
    return (translation_results + [""] * 3)[:3]
|
|
def perform_translation(audio, typed_text, input_lang, output_langs):
    """Translate typed or spoken input and synthesize speech for each result.

    Args:
        audio: Audio input, used only when no typed text is supplied.
        typed_text: Text entered by the user; takes priority over *audio*.
        input_lang: Name of the input language.
        output_langs: Selected target language names; only the first three
            are used.

    Returns:
        Seven values, in order: the input text, three translation strings
        and three audio file paths (``None`` where no audio was produced).
    """
    if not output_langs:
        return typed_text, "", "", "", None, None, None

    selected_langs = output_langs[:3]

    # Whitespace-only typed text counts as "nothing typed" so the audio
    # fallback still triggers instead of translating blanks.
    input_text = (typed_text or "").strip()
    from_audio = False
    if not input_text and audio:
        input_text = (transcribe_audio_locally(audio) or "").strip()
        from_audio = True

    if not input_text:
        return "", "", "", "", None, None, None

    # The transcription helper reports failures as a message string; show it
    # to the user rather than sending the error text off to be translated.
    if from_audio and input_text.startswith("Error transcribing audio:"):
        return input_text, "", "", "", None, None, None

    translations_text = translate_text(input_text, input_lang, selected_langs)

    # Likewise, a translation failure comes back as "Error: ...". Display it
    # in the first slot but do not run it through text-to-speech.
    if isinstance(translations_text, str) and translations_text.startswith("Error:"):
        return [input_text, translations_text, "", "", None, None, None]

    translation_results = extract_translations(translations_text, selected_langs)

    # One audio clip per non-empty translation, paired with its language.
    audio_paths = [
        synthesize_speech(trans, lang) if trans else None
        for trans, lang in zip(translation_results, selected_langs)
    ]
    # Fewer than three languages selected: pad the remaining audio slots.
    audio_paths += [None] * (3 - len(audio_paths))

    return [input_text] + translation_results + audio_paths
|
|
# Gradio UI: language pickers, audio/text input, one transcription box,
# three translation boxes with matching audio players, and two buttons.
with gr.Blocks() as demo:
    gr.Markdown("## 🌍 Multilingual Translator with Speech Support")

    with gr.Row():
        input_lang = gr.Dropdown(
            choices=SUPPORTED_LANGUAGES,
            value="English",
            label="Input Language",
        )
        output_langs = gr.CheckboxGroup(
            choices=SUPPORTED_LANGUAGES,
            label="Output Languages (select up to 3)",
        )

    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Speak Your Input (upload or record)")
        text_input = gr.Textbox(label="Or Type Text", elem_id="text_input")

    transcribed_text = gr.Textbox(label="Transcribed Text (from audio)", interactive=False)
    translated_outputs = [
        gr.Textbox(label=f"Translation {slot}", interactive=False)
        for slot in range(1, 4)
    ]
    audio_outputs = [
        gr.Audio(label=f"Speech Output {slot}")
        for slot in range(1, 4)
    ]

    with gr.Row():
        translate_btn = gr.Button("Translate", elem_id="translate_btn")
        clear_btn = gr.Button("Clear Memory")

    def on_audio_change(audio):
        """Transcribe newly recorded/uploaded audio; blank when it is cleared."""
        return "" if audio is None else process_speech_to_text(audio)

    # The same component lists are wired to several events below.
    translation_inputs = [audio_input, text_input, input_lang, output_langs]
    result_fields = [transcribed_text, *translated_outputs, *audio_outputs]

    # Show the transcription as soon as the audio input changes.
    audio_input.change(
        on_audio_change,
        inputs=[audio_input],
        outputs=[transcribed_text],
    )

    # Submitting the textbox and clicking "Translate" run the same pipeline.
    text_input.submit(
        perform_translation,
        inputs=translation_inputs,
        outputs=result_fields,
    )
    translate_btn.click(
        perform_translation,
        inputs=translation_inputs,
        outputs=result_fields,
    )

    # Reset the transcription, translations and audio players.
    clear_btn.click(
        clear_memory,
        inputs=[],
        outputs=result_fields,
    )

demo.launch()