Spaces:
Sleeping
Sleeping
File size: 3,923 Bytes
5ffa482 64a7f54 826741b 94bed02 cf95b7a 826741b 94bed02 826741b 94bed02 64a7f54 94bed02 cf95b7a 826741b 64a7f54 cf95b7a 826741b cf95b7a 64a7f54 cf95b7a 64a7f54 826741b cf95b7a 826741b cf95b7a 94bed02 64a7f54 cf95b7a 64a7f54 cf95b7a 94bed02 cf95b7a 826741b cf95b7a 826741b cf95b7a 826741b cf95b7a 2faa641 826741b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 | import gradio as gr
import torch
from transformers import pipeline, VitsModel, VitsTokenizer
from PIL import Image
import numpy as np
import time
# Model identifiers passed to transformers below.
DETECTION_CHECKPOINT = "facebook/detr-resnet-50"
TRANSLATION_CHECKPOINT = "facebook/nllb-200-distilled-600M"
TTS_CHECKPOINT = "facebook/mms-tts-ben"

# Object detector: called with an image, yields detections carrying a 'label'.
obj_detector = pipeline(task="object-detection", model=DETECTION_CHECKPOINT)

# English -> Bangla translator, loaded in bfloat16.
translator = pipeline(
    task="translation",
    model=TRANSLATION_CHECKPOINT,
    torch_dtype=torch.bfloat16,
)

# Text-to-speech model and its tokenizer, loaded once at import time so each
# request does not pay the loading cost.
tts_model = VitsModel.from_pretrained(TTS_CHECKPOINT)
tts_tokenizer = VitsTokenizer.from_pretrained(TTS_CHECKPOINT)
def translate_label(label_en):
    """Return the Bangla translation of an English label.

    The module-level ``translator`` pipeline is called with
    ``eng_Latn`` as source and ``ben_Beng`` as target language. If anything
    goes wrong, the error is printed and the English label is returned
    unchanged so the caller always receives a usable string.
    """
    try:
        outputs = translator(label_en, src_lang="eng_Latn", tgt_lang="ben_Beng")
        translated = outputs[0]["translation_text"]
    except Exception as err:
        # Best-effort: keep the English label rather than failing the request.
        print(f"Translation error: {err}")
        translated = label_en
    return translated
def format_bangla_list(items):
    """Join labels into a Bangla enumeration.

    Nothing yields an empty string, one item is returned as is, two items are
    joined with 'এবং', and longer lists are comma separated with 'এবং' placed
    before the final item.
    """
    if not items:
        return ""

    count = len(items)
    if count == 1:
        return items[0]
    if count == 2:
        first, second = items
        return f"{first} এবং {second}"

    # Three or more: everything except the last item is comma separated.
    *leading, final = items
    return ", ".join(leading) + " এবং " + final
def recognize_object(image: "Image.Image"):
    """Detect objects in an image, translate their labels, and synthesize speech.

    Args:
        image: PIL image from the Gradio image input. May be ``None`` when the
            button is pressed before an image was uploaded or captured.

    Returns:
        A ``(details, audio)`` tuple for the text box and the audio player.
        ``details`` is a header line with the object count followed by one
        ``English → Bangla`` line per detection. ``audio`` is a
        ``(sampling_rate, float32 waveform)`` tuple, or ``None`` when there is
        no image or nothing was detected (``details`` then holds a short
        Bangla message instead).
    """
    # Without an image there is nothing to run the detector on; tell the user
    # instead of letting the detector raise.
    if image is None:
        return "অনুগ্রহ করে প্রথমে একটি ছবি দিন", None

    start_time = time.time()

    # Detect objects
    detections = obj_detector(image)
    print(f"Detection time: {time.time() - start_time:.2f}s")
    if not detections:
        return "কোনো বস্তু শনাক্ত হয়নি", None

    # Extract labels
    labels_en = [d['label'] for d in detections]
    print("Detected objects:", labels_en)

    # The same label can appear in several detections; translate each distinct
    # label once and reuse the result, keeping one entry per detection.
    translations = {
        label: translate_label(label) for label in dict.fromkeys(labels_en)
    }
    bangla_labels = [translations[label] for label in labels_en]
    print("Translated labels:", bangla_labels)

    # Format the list naturally in Bangla
    formatted_text = format_bangla_list(bangla_labels)

    # Add prefix based on number of objects
    num_objects = len(bangla_labels)
    if num_objects == 1:
        prefix = "এখানে একটি বস্তু আছে: "
    else:
        prefix = f"এখানে {num_objects}টি বস্তু আছে: "
    full_text = prefix + formatted_text
    print("Full Bangla text:", full_text)

    # Generate speech; no_grad because this is inference only.
    tts_start = time.time()
    inputs = tts_tokenizer(text=full_text, return_tensors="pt")
    with torch.no_grad():
        output = tts_model(**inputs).waveform
    audio_data = (
        tts_model.config.sampling_rate,
        output.squeeze().numpy().astype(np.float32),
    )
    print(f"TTS time: {time.time() - tts_start:.2f}s")

    # Create detailed output text: one "English → Bangla" line per detection.
    details = "\n".join(f"{en} → {bn}" for en, bn in zip(labels_en, bangla_labels))
    details = f"শনাক্তকৃত বস্তু ({num_objects}টি):\n" + details
    print(f"Total processing time: {time.time() - start_time:.2f}s")
    return details, audio_data
# Build the UI: an image input, a text box for the English → Bangla label
# pairs, an audio player for the spoken summary, and a button to run it.
with gr.Blocks(title="এটা কী?") as demo:
    gr.Markdown("## 🧠 এটা কী? (Bangla Object Identifier)")
    gr.Markdown("ছবি দিন বা তুলুন—সমস্ত বস্তুর বাংলা নাম ও অডিও শুনুন।")
    img_input = gr.Image(label="ছবি দিন বা তুলুন", type="pil", sources=["upload", "webcam"])
    label_out = gr.Textbox(label="শনাক্তকৃত বস্তু", lines=5)
    audio_out = gr.Audio(label="বাংলা অডিও", interactive=False)
    btn = gr.Button("বস্তু চিনুন")
    # recognize_object returns (text, audio), matching the two outputs in order.
    btn.click(recognize_object, inputs=img_input, outputs=[label_out, audio_out])

demo.launch()