Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| from transformers import pipeline, VitsModel, VitsTokenizer | |
| from PIL import Image | |
| import numpy as np | |
| import time | |
# Object detector: the "facebook/detr-resnet-50" checkpoint behind a
# transformers object-detection pipeline.
obj_detector = pipeline(task="object-detection", model="facebook/detr-resnet-50")

# Translator: distilled NLLB-200 checkpoint, loaded with bfloat16 weights.
# Source/target languages are chosen per call, not here.
translator = pipeline(
    model="facebook/nllb-200-distilled-600M",
    task="translation",
    torch_dtype=torch.bfloat16,
)

# Text-to-speech: tokenizer and VITS model from the same
# "facebook/mms-tts-ben" checkpoint, loaded once at import time so requests
# do not pay the loading cost.
tts_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-ben")
tts_model = VitsModel.from_pretrained("facebook/mms-tts-ben")
def translate_label(label_en):
    """Return the Bengali translation of an English label.

    The module-level NLLB ``translator`` pipeline is called with
    ``eng_Latn`` as source and ``ben_Beng`` as target. Translation is
    best-effort: if anything goes wrong the error is printed and the
    original English label is returned unchanged.
    """
    try:
        outputs = translator(label_en, src_lang="eng_Latn", tgt_lang="ben_Beng")
        translated = outputs[0]["translation_text"]
    except Exception as exc:
        print(f"Translation error: {exc}")
        # Keep the pipeline going with the untranslated label.
        translated = label_en
    return translated
def format_bangla_list(items):
    """Join items into a single Bangla phrase.

    Returns an empty string for an empty input, the item itself for a
    single item, "a এবং b" for two items, and "a, b এবং c" (comma-separated
    with 'এবং' before the final item) for three or more.
    """
    conjunction = " এবং "

    if not items:
        return ""

    total = len(items)
    if total == 1:
        return items[0]
    if total == 2:
        return f"{items[0]}{conjunction}{items[1]}"

    # Three or more: everything but the last is comma-separated.
    leading = ", ".join(items[:-1])
    return leading + conjunction + items[-1]
def recognize_object(image: Image.Image):
    """Detect objects in an image, name them in Bangla, and speak the result.

    Args:
        image: PIL image coming from the Gradio image input. ``None`` (the
            button was pressed before any image was provided) is accepted
            and answered with a prompt instead of an exception.

    Returns:
        A ``(details, audio)`` tuple. ``details`` is the text shown to the
        user: a header with the object count followed by one
        "English → Bangla" line per detection. ``audio`` is a
        ``(sampling_rate, float32 waveform)`` pair for the spoken sentence,
        or ``None`` when there is no image or nothing was detected.
    """
    # Gradio hands over None when no image has been uploaded or captured;
    # the detector cannot handle that, so bail out with a prompt.
    if image is None:
        return "অনুগ্রহ করে একটি ছবি দিন", None

    start_time = time.time()

    # Detect objects
    detections = obj_detector(image)
    print(f"Detection time: {time.time() - start_time:.2f}s")
    if not detections:
        return "কোনো বস্তু শনাক্ত হয়নি", None

    labels_en = [d['label'] for d in detections]
    print("Detected objects:", labels_en)

    # The same label can appear in several detections; run the translation
    # model once per distinct label and reuse the result, keeping the
    # per-detection order of the output list.
    translations = {
        label: translate_label(label) for label in dict.fromkeys(labels_en)
    }
    bangla_labels = [translations[label] for label in labels_en]
    print("Translated labels:", bangla_labels)

    # Build the sentence to be spoken: count prefix + natural Bangla list.
    formatted_text = format_bangla_list(bangla_labels)
    num_objects = len(bangla_labels)
    if num_objects == 1:
        prefix = "এখানে একটি বস্তু আছে: "
    else:
        prefix = f"এখানে {num_objects}টি বস্তু আছে: "
    full_text = prefix + formatted_text
    print("Full Bangla text:", full_text)

    # Generate speech; inference only, so gradients are disabled.
    tts_start = time.time()
    inputs = tts_tokenizer(text=full_text, return_tensors="pt")
    with torch.no_grad():
        output = tts_model(**inputs).waveform
    audio_data = (
        tts_model.config.sampling_rate,
        output.squeeze().numpy().astype(np.float32),
    )
    print(f"TTS time: {time.time() - tts_start:.2f}s")

    # Detailed text output: one "English → Bangla" line per detection.
    details = "\n".join(
        f"{en} → {bn}" for en, bn in zip(labels_en, bangla_labels)
    )
    details = f"শনাক্তকৃত বস্তু ({num_objects}টি):\n" + details
    print(f"Total processing time: {time.time() - start_time:.2f}s")
    return details, audio_data
# Gradio UI: one image input, a text box and an audio player for the
# results, and a button that runs recognize_object on the current image.
with gr.Blocks(title="এটা কী?") as demo:
    gr.Markdown("## 🧠 এটা কী? (Bangla Object Identifier)")
    gr.Markdown("ছবি দিন বা তুলুন—সমস্ত বস্তুর বাংলা নাম ও অডিও শুনুন।")

    img_input = gr.Image(
        label="ছবি দিন বা তুলুন",
        type="pil",  # recognize_object receives a PIL image
        sources=["upload", "webcam"],
    )
    label_out = gr.Textbox(label="শনাক্তকৃত বস্তু", lines=5)
    audio_out = gr.Audio(label="বাংলা অডিও", interactive=False)

    btn = gr.Button("বস্তু চিনুন")
    # The two return values of recognize_object map onto the text box and
    # the audio player, in that order.
    btn.click(
        fn=recognize_object,
        inputs=img_input,
        outputs=[label_out, audio_out],
    )

# Start the web server for the app.
demo.launch()