"""Gradio app that detects objects in an image and names them in Bangla.

Pipeline: DETR object detection -> NLLB English-to-Bengali translation of the
detected labels -> MMS VITS text-to-speech of the combined Bangla sentence.
"""

import time
from typing import List, Optional, Tuple

import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import VitsModel, VitsTokenizer, pipeline

# All models are loaded once at import time so every request reuses them.

# Load object detection pipeline
obj_detector = pipeline("object-detection", model="facebook/detr-resnet-50")

# Initialize translation pipeline
translator = pipeline(
    task="translation",
    model="facebook/nllb-200-distilled-600M",
    torch_dtype=torch.bfloat16,
)

# Preload TTS components
tts_model = VitsModel.from_pretrained("facebook/mms-tts-ben")
tts_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-ben")


def translate_label(label_en: str) -> str:
    """Translate English text to Bengali using the NLLB model.

    Args:
        label_en: English text to translate.

    Returns:
        The Bengali translation, or ``label_en`` unchanged if the translation
        call raises (the error is printed, not propagated).
    """
    try:
        result = translator(label_en, src_lang="eng_Latn", tgt_lang="ben_Beng")
        return result[0]['translation_text']
    except Exception as e:
        # Deliberate best-effort: a failed translation must not abort the
        # whole request, so fall back to the English label.
        print(f"Translation error: {e}")
        return label_en


def format_bangla_list(items: List[str]) -> str:
    """Join items into a natural Bangla enumeration.

    Args:
        items: Strings to join, in order.

    Returns:
        ``""`` for no items, the item itself for one, ``"a এবং b"`` for two,
        and ``"a, b এবং c"`` (commas, then 'এবং' before the last) for more.
    """
    if not items:
        return ""
    if len(items) == 1:
        return items[0]
    if len(items) == 2:
        return f"{items[0]} এবং {items[1]}"
    # For 3+ items: comma separate all but last, add 'এবং' before last
    return ", ".join(items[:-1]) + " এবং " + items[-1]


def recognize_object(
    image: Optional[Image.Image],
) -> Tuple[str, Optional[Tuple[int, np.ndarray]]]:
    """Detect all objects, translate their labels, and generate combined speech.

    Args:
        image: The PIL image to analyse, or ``None`` when the user triggered
            the action without supplying an image.

    Returns:
        A ``(details, audio)`` pair. ``details`` is the text shown to the
        user: an ``English → Bangla`` line per detection under a count header,
        or a short Bangla message when there is no image or no detection.
        ``audio`` is ``(sampling_rate, float32 waveform)`` for the spoken
        sentence, or ``None`` when nothing was synthesised.
    """
    # Gradio hands over None when the button is clicked without an image;
    # calling the detector on it would raise, so answer with a prompt instead.
    if image is None:
        return "অনুগ্রহ করে ছবি দিন বা তুলুন", None

    start_time = time.time()

    # Detect objects
    detections = obj_detector(image)
    print(f"Detection time: {time.time() - start_time:.2f}s")

    if not detections:
        return "কোনো বস্তু শনাক্ত হয়নি", None

    # Extract labels (one entry per detection, duplicates kept)
    labels_en = [d['label'] for d in detections]
    print("Detected objects:", labels_en)

    # Translate each distinct label only once per request; the same class is
    # often detected several times and translation is the slow step.
    translations = {label: translate_label(label) for label in dict.fromkeys(labels_en)}
    bangla_labels = [translations[label] for label in labels_en]
    print("Translated labels:", bangla_labels)

    # Format the list naturally in Bangla
    formatted_text = format_bangla_list(bangla_labels)

    # Add prefix based on number of objects
    # NOTE(review): the plural prefix embeds ASCII digits; confirm the TTS
    # tokenizer actually voices them rather than dropping them.
    num_objects = len(bangla_labels)
    if num_objects == 1:
        prefix = "এখানে একটি বস্তু আছে: "
    else:
        prefix = f"এখানে {num_objects}টি বস্তু আছে: "

    full_text = prefix + formatted_text
    print("Full Bangla text:", full_text)

    # Generate speech
    tts_start = time.time()
    inputs = tts_tokenizer(text=full_text, return_tensors="pt")
    with torch.no_grad():
        output = tts_model(**inputs).waveform
    audio_data = (
        tts_model.config.sampling_rate,
        output.squeeze().numpy().astype(np.float32),
    )
    print(f"TTS time: {time.time() - tts_start:.2f}s")

    # Create detailed output text
    details = "\n".join(f"{en} → {bn}" for en, bn in zip(labels_en, bangla_labels))
    details = f"শনাক্তকৃত বস্তু ({num_objects}টি):\n" + details

    print(f"Total processing time: {time.time() - start_time:.2f}s")
    return details, audio_data


with gr.Blocks(title="এটা কী?") as demo:
    gr.Markdown("## 🧠 এটা কী? (Bangla Object Identifier)")
    gr.Markdown("ছবি দিন বা তুলুন—সমস্ত বস্তুর বাংলা নাম ও অডিও শুনুন।")

    img_input = gr.Image(label="ছবি দিন বা তুলুন", type="pil", sources=["upload", "webcam"])
    label_out = gr.Textbox(label="শনাক্তকৃত বস্তু", lines=5)
    audio_out = gr.Audio(label="বাংলা অডিও", interactive=False)

    btn = gr.Button("বস্তু চিনুন")
    btn.click(recognize_object, inputs=img_input, outputs=[label_out, audio_out])

# Launched at module level, as in the original script.
demo.launch()